import argparse
import ast
import time

import torch
from PIL import Image, ImageDraw
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

# Parse command-line arguments
parser = argparse.ArgumentParser(description="Qwen-VL Inference")
parser.add_argument("--image_path", type=str, required=False, default="./test_screenshots/amazon.png", help="Path to the input image")
parser.add_argument("--query", type=str, required=False, default="Click on the 'Chairs'.", help="Text query or instruction")
parser.add_argument("--model_dir", type=str, default="./", help="Path to the local ShowUI model directory")
args = parser.parse_args()
DEVICE = "cuda:0"
# Constants for the processor (adjust as per your model requirements)
MIN_PIXELS = 256 * 28 * 28
# MAX_PIXELS = 1024 * 28 * 28
# MAX_PIXELS = 1280 * 28 * 28
MAX_PIXELS = 1344 * 28 * 28
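# Each Qwen2-VL visual token corresponds to a 28x28 pixel patch, so these bounds
# keep the resized image between roughly 256 and 1344 visual tokens.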
def draw_point_on_image(image_path, position, output_path="output_image.png", radius=2, color="red"):
    """Mark a normalized [x, y] position on the image and save the annotated copy."""
    image = Image.open(image_path).convert("RGB")
    draw = ImageDraw.Draw(image)
    width, height = image.size
    # The model returns positions normalized to [0, 1]; scale them to pixel coordinates
    x = int(position[0] * width)
    y = int(position[1] * height)
    draw.ellipse([(x - radius, y - radius), (x + radius, y + radius)], fill=color, outline=color)
    image.save(output_path)
    # print(f"Point drawn at ({x}, {y}) and saved to {output_path}")
# Load model and processor
model = Qwen2VLForConditionalGeneration.from_pretrained(
    args.model_dir,
    torch_dtype=torch.float16,
    # torch_dtype=torch.bfloat16,
    device_map="cpu",
)
print("Model dtype:", model.dtype)
# print("Model weights dtype:", model.model.layers[0].self_attn.q_proj.weight)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS)
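# Note: the processor/tokenizer config is pulled from the Hugging Face hub ID above,
# while the model weights are loaded from the local --model_dir.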
# Free any cached CUDA memory so the measurements below reflect this run
torch.cuda.empty_cache()
_NAV_SYSTEM = """You are an assistant trained to navigate the {_APP} screen.
Given a task instruction, a screen observation, and an action history sequence,
output the next action and wait for the next observation.
Here is the action space:
{_ACTION_SPACE}
"""
_ACTION_MAP = """
1. CLICK: Click on an element, value is not applicable and the position [x,y] is required.
2. INPUT: Type a string into an element, value is a string to type and the position [x,y] is required.
3. HOVER: Hover on an element, value is not applicable and the position [x,y] is required.
4. ENTER: Enter operation, value and position are not applicable.
5. SCROLL: Scroll the screen, value is the direction to scroll and the position is not applicable.
6. ESC: ESCAPE operation, value and position are not applicable.
7. PRESS: Long click on an element, value is not applicable and the position [x,y] is required.
"""
_SYSTEM = _NAV_SYSTEM.format(
    _APP="web",
    _ACTION_SPACE=_ACTION_MAP,
)
if args.query:
    _QUERY = args.query
else:
    _QUERY = "Click on the 'Chairs'."
# Construct the input message
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": _SYSTEM},
            {"type": "image", "image": args.image_path, "min_pixels": MIN_PIXELS, "max_pixels": MAX_PIXELS},
            {"type": "text", "text": _QUERY},
        ],
    }
]
# Process the message through the processor
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# Load the image explicitly with PIL and pass it to the processor alongside the chat text
image = Image.open(args.image_path).convert("RGB")
inputs = processor(
    text=[text],
    images=[image],
    padding=True,
    return_tensors="pt",
)
# Move inputs and model to GPU
model = model.to(DEVICE)
inputs = inputs.to(DEVICE)
# print("Model dtype after to(DEVICE):", model.model.layers[0].self_attn.q_proj.weight)
print(f"Max CUDA memory after model load: {torch.cuda.max_memory_allocated(device=DEVICE)/1024**2:.2f} MB")
# Reset memory stats before inference
torch.cuda.reset_peak_memory_stats(device=DEVICE)
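# Peak stats are reset here so the post-inference readout reflects generation only,
# not the initial model transfer to the GPU.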
N_RUNS = 10
times = []
model.eval()

with torch.no_grad():
    for i in range(N_RUNS):
        start_time = time.time()
        generated_ids = model.generate(**inputs, max_new_tokens=128)
        # Extract the portion of generated_ids corresponding to the new generation
        generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]
        # Convert the output string to a Python object if it is a dict-like action,
        # e.g. "{'action': 'CLICK', 'value': None, 'position': [0.28, 0.29]}"
        try:
            result = ast.literal_eval(output_text)
        except (ValueError, SyntaxError):
            result = output_text
        end_time = time.time()
        times.append(end_time - start_time)
        print(f"Run {i+1}/{N_RUNS} - Time: {end_time - start_time:.4f} s, Output: {result}")

        # Visualize CLICK actions by marking the predicted normalized position on the input image
        if isinstance(result, dict) and result.get('action', '').upper() == 'CLICK':
            x, y = result['position'][0], result['position'][1]
            draw_point_on_image(args.image_path, [x, y], output_path="./output_image.png")
avg_time = sum(times) / len(times)
print(f"Average per inference time: {avg_time:.4f} seconds")
# Print CUDA memory usage after inference
print(f"Max CUDA memory after inference: {torch.cuda.max_memory_allocated(device=DEVICE)/1024**2:.2f} MB")
# Print image size information
print(f"Input image size: {Image.open(args.image_path).size}")