File size: 5,725 Bytes
0b46483
 
 
 
 
 
 
 
 
 
 
 
1d23a19
0b46483
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import argparse
import ast
import torch
from PIL import Image
import time
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from PIL import Image, ImageDraw

# Command-line interface. Every option carries a default, so the script can
# also be launched with no arguments at all.
parser = argparse.ArgumentParser(description="Qwen-VL Inference")
parser.add_argument(
    "--image_path",
    type=str,
    default="./test_screenshots/amazon.png",
    help="Path to the input image",
)
parser.add_argument(
    "--query",
    type=str,
    default="Click on the 'Chairs'.",
    help="Text query or instruction",
)
parser.add_argument(
    "--model_dir",
    type=str,
    default="./",
    help="Path to the local ShowUI model directory",
)
args = parser.parse_args()

# CUDA device string used when moving the model and inputs and when querying
# memory statistics.
DEVICE = "cuda:0"

# Pixel bounds passed to the processor as min_pixels / max_pixels, expressed
# as N * 28 * 28. Adjust to the model's requirements; 1024 and 1280 are the
# alternative upper-bound multipliers the author left commented out.
MIN_PIXELS = 256 * 28 ** 2
MAX_PIXELS = 1344 * 28 ** 2


def draw_point_on_image(image_path, position, output_path="output_image.png", radius=2, color="red"):
    """Mark a point on an image with a filled circle and save the result.

    Args:
        image_path: Path of the image to open; it is converted to RGB.
        position: Indexable pair ``(x, y)`` of fractions of the image width
            and height. Each value is multiplied by the matching dimension
            and truncated to an int. Presumably in [0, 1]; not checked here.
        output_path: Path the annotated image is written to.
        radius: Half-size, in pixels, of the circle's bounding box.
        color: Used as both fill and outline of the circle.
    """
    canvas = Image.open(image_path).convert("RGB")
    img_w, img_h = canvas.size

    # Scale the fractional coordinates to pixel coordinates.
    cx = int(position[0] * img_w)
    cy = int(position[1] * img_h)

    # Bounding box of the circle centred on (cx, cy).
    bbox = [(cx - radius, cy - radius), (cx + radius, cy + radius)]
    ImageDraw.Draw(canvas).ellipse(bbox, fill=color, outline=color)

    canvas.save(output_path)

# Load the model weights from --model_dir as float16 with device_map="cpu"
# (bfloat16 is the alternative dtype the author left commented out).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    args.model_dir, torch_dtype=torch.float16, device_map="cpu"
)
print("Model dtype:", model.dtype)

# NOTE: the processor is loaded from the fixed identifier below, not from
# args.model_dir, and is given the same pixel bounds as the image entry.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=MIN_PIXELS,
    max_pixels=MAX_PIXELS,
)

# Release unused cached CUDA memory; this call does not print anything.
torch.cuda.empty_cache()

# System prompt template; {_APP} and {_ACTION_SPACE} are filled in by
# str.format below. Trailing spaces and leading indentation are part of the
# prompt text, so every line is spelled out as an explicit literal.
_NAV_SYSTEM = (
    "You are an assistant trained to navigate the {_APP} screen. \n"
    "    Given a task instruction, a screen observation, and an action history sequence, \n"
    "    output the next action and wait for the next observation. \n"
    "    Here is the action space:\n"
    "    {_ACTION_SPACE}\n"
    "    "
)

# Numbered description of the available actions, substituted into the
# system prompt as {_ACTION_SPACE}.
_ACTION_MAP = (
    "\n"
    "        1. CLICK: Click on an element, value is not applicable and the position [x,y] is required. \n"
    "        2. INPUT: Type a string into an element, value is a string to type and the position [x,y] is required. \n"
    "        3. HOVER: Hover on an element, value is not applicable and the position [x,y] is required.\n"
    "        4. ENTER: Enter operation, value and position are not applicable.\n"
    "        5. SCROLL: Scroll the screen, value is the direction to scroll and the position is not applicable.\n"
    "        6. ESC: ESCAPE operation, value and position are not applicable.\n"
    "        7. PRESS: Long click on an element, value is not applicable and the position [x,y] is required. \n"
    "        "
)

# Fully rendered system prompt for the "web" app.
_SYSTEM = _NAV_SYSTEM.format(_APP="web", _ACTION_SPACE=_ACTION_MAP)

# Instruction sent to the model; an empty --query falls back to the built-in
# example instruction.
_QUERY = args.query or "Click on the 'Chairs'."

# Content of the single user turn, in order: system prompt text, the
# screenshot (with its pixel bounds), then the instruction.
_user_content = [
    {"type": "text", "text": _SYSTEM},
    {"type": "image", "image": args.image_path, "min_pixels": MIN_PIXELS, "max_pixels": MAX_PIXELS},
    {"type": "text", "text": _QUERY},
]
messages = [{"role": "user", "content": _user_content}]

# Render the chat messages into a prompt string (tokenize=False) with the
# generation prompt appended (add_generation_prompt=True).
text = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# The screenshot is opened here with PIL and handed to the processor together
# with the prompt; the result is a batch of PyTorch tensors.
image = Image.open(args.image_path).convert("RGB")
inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt")

# Put both the model and the encoded inputs on the target device.
model = model.to(DEVICE)
inputs = inputs.to(DEVICE)

# Peak allocation reported so far, converted from bytes with 1024**2.
_peak_after_load = torch.cuda.max_memory_allocated(device=DEVICE) / 1024**2
print(f"Max CUDA memory after model load: {_peak_after_load:.2f} MB")

# Start a fresh peak-memory measurement for the generation runs that follow.
torch.cuda.reset_peak_memory_stats(device=DEVICE)

# Benchmark: run generation N_RUNS times on the same inputs, recording the
# wall-clock time of each run (generation + decoding + parsing) in `times`.
N_RUNS = 10
times = []

model.eval()
with torch.no_grad():

    for i in range(N_RUNS):
        # perf_counter is the monotonic, high-resolution clock meant for
        # measuring short intervals.
        start_time = time.perf_counter()

        generated_ids = model.generate(**inputs, max_new_tokens=128)

        # Keep only the newly generated tokens by slicing off the prompt
        # prefix of each sequence.
        generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]

        # Try to turn the output into a Python object (e.g. an action dict).
        # Only literal_eval's documented failure modes are caught, so
        # KeyboardInterrupt and genuine bugs still propagate; on failure the
        # raw text is kept.
        try:
            result = ast.literal_eval(output_text)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            result = output_text

        end_time = time.perf_counter()
        elapsed = end_time - start_time
        times.append(elapsed)
        print(f"Run {i+1}/{N_RUNS} - Time: {elapsed:.4f} s, Output: {result}")

        # Expected shape of a click result:
        #   {'action': 'CLICK', 'value': None, 'position': [0.28, 0.29]}
        # The output may instead be unparsed text or another literal, so
        # validate before indexing rather than crashing the benchmark.
        if not isinstance(result, dict):
            continue
        action = result.get("action")
        position = result.get("position")
        if (
            isinstance(action, str)
            and action.upper() == "CLICK"
            and isinstance(position, (list, tuple))
            and len(position) >= 2
        ):
            x, y = position[0], position[1]
            draw_point_on_image(args.image_path, [x, y], output_path="./output_image.png")


# Mean wall-clock time over all recorded runs.
avg_time = sum(times) / len(times)
print(f"Average per inference time: {avg_time:.4f} seconds")

# Peak CUDA allocation since the statistics were last reset (the script
# resets them just before the generation runs), converted with 1024**2.
print(f"Max CUDA memory after inference: {torch.cuda.max_memory_allocated(device=DEVICE)/1024**2:.2f} MB")

# Image.open is lazy: reading .size never loads the pixel data, so without
# the context manager the file would stay open until garbage collection.
with Image.open(args.image_path) as _input_image:
    print(f"Input image size: {_input_image.size}")