# File size: 5,725 Bytes

import argparse
import ast
import torch
from PIL import Image
import time
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from PIL import Image, ImageDraw

# Command-line interface. Every option is an optional string with a default,
# so the script can also be launched without any arguments.
parser = argparse.ArgumentParser(description="Qwen-VL Inference")
for _flag, _default, _help in (
    ("--image_path", "./test_screenshots/amazon.png", "Path to the input image"),
    ("--query", "Click on the 'Chairs'.", "Text query or instruction"),
    ("--model_dir", "./", "Path to the local ShowUI model directory"),
):
    parser.add_argument(_flag, type=str, default=_default, help=_help)
args = parser.parse_args()

# CUDA device string used for the model, its inputs and the memory statistics.
DEVICE = "cuda:0"

# Pixel bounds handed to the processor (adjust as per your model requirements).
# Both bounds are multiples of the same 28 * 28 factor.
_PIXEL_UNIT = 28 * 28
MIN_PIXELS = 256 * _PIXEL_UNIT
# Alternative upper bounds: 1024 * 28 * 28 or 1280 * 28 * 28.
MAX_PIXELS = 1344 * _PIXEL_UNIT


def draw_point_on_image(image_path, position, output_path="output_image.png", radius=2, color="red"):
    """Mark a relative position on an image with a filled dot and save the result.

    Args:
        image_path: Path of the image to open.
        position: Indexable pair ``(x, y)``; each component is multiplied by
            the image width / height, i.e. it is a fraction of the image size.
        output_path: Path the annotated image is written to.
        radius: Half-width of the dot's bounding box, in pixels.
        color: Used for both the fill and the outline of the dot.
    """
    canvas = Image.open(image_path).convert("RGB")
    img_w, img_h = canvas.size
    # Scale the relative coordinates to pixel coordinates (int() truncates).
    cx, cy = int(position[0] * img_w), int(position[1] * img_h)
    top_left = (cx - radius, cy - radius)
    bottom_right = (cx + radius, cy + radius)
    ImageDraw.Draw(canvas).ellipse([top_left, bottom_right], fill=color, outline=color)
    canvas.save(output_path)

# Load the model from the local directory in float16 on the CPU; it is moved
# to DEVICE further down, just before inference.
_model_kwargs = {
    "torch_dtype": torch.float16,  # alternative: torch.bfloat16
    "device_map": "cpu",
}
model = Qwen2VLForConditionalGeneration.from_pretrained(args.model_dir, **_model_kwargs)
print("Model dtype:", model.dtype)

# The processor is loaded from the fixed identifier below, not from
# args.model_dir, and receives the same pixel bounds as the image message.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=MIN_PIXELS,
    max_pixels=MAX_PIXELS,
)

# Release cached, unused CUDA memory (nothing is printed at this point).
torch.cuda.empty_cache()

# Numbered list of supported actions; substituted into the navigation prompt.
# The literals are spelled out line by line so that the trailing spaces and
# the indentation that belong to the prompt text stay visible.
_ACTION_MAP = (
    "\n"
    "        1. CLICK: Click on an element, value is not applicable and the position [x,y] is required. \n"
    "        2. INPUT: Type a string into an element, value is a string to type and the position [x,y] is required. \n"
    "        3. HOVER: Hover on an element, value is not applicable and the position [x,y] is required.\n"
    "        4. ENTER: Enter operation, value and position are not applicable.\n"
    "        5. SCROLL: Scroll the screen, value is the direction to scroll and the position is not applicable.\n"
    "        6. ESC: ESCAPE operation, value and position are not applicable.\n"
    "        7. PRESS: Long click on an element, value is not applicable and the position [x,y] is required. \n"
    "        "
)

# Prompt template with two placeholders: {_APP} and {_ACTION_SPACE}.
_NAV_SYSTEM = (
    "You are an assistant trained to navigate the {_APP} screen. \n"
    "    Given a task instruction, a screen observation, and an action history sequence, \n"
    "    output the next action and wait for the next observation. \n"
    "    Here is the action space:\n"
    "    {_ACTION_SPACE}\n"
    "    "
)

# Final instruction text: the template filled in for the "web" application.
_SYSTEM = _NAV_SYSTEM.format(_APP="web", _ACTION_SPACE=_ACTION_MAP)

# Use the supplied query; an empty one falls back to the stock instruction.
_QUERY = args.query or "Click on the 'Chairs'."

# Single user turn: the instruction text, the screenshot entry, then the query.
_image_entry = {
    "type": "image",
    "image": args.image_path,
    "min_pixels": MIN_PIXELS,
    "max_pixels": MAX_PIXELS,
}
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": _SYSTEM},
            _image_entry,
            {"type": "text", "text": _QUERY},
        ],
    },
]

# Render the chat template to a prompt string (tokenize=False) that ends with
# the generation prompt.
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# The screenshot is opened here and handed to the processor as an RGB image,
# together with the rendered prompt, as a batch of one.
image = Image.open(args.image_path).convert("RGB")
inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt")

# Transfer both the model and the processed inputs to the target device.
model = model.to(DEVICE)
inputs = inputs.to(DEVICE)

# Report the peak allocation recorded so far, scaled by 1024**2.
_peak_after_load = torch.cuda.max_memory_allocated(device=DEVICE) / 1024**2
print(f"Max CUDA memory after model load: {_peak_after_load:.2f} MB")

# Restart peak tracking so the figure printed after the benchmark covers
# inference only.
torch.cuda.reset_peak_memory_stats(device=DEVICE)

N_RUNS = 10
times = []


def _parse_model_output(output_text):
    """Return the Python literal encoded in *output_text*, or the raw text.

    A typical answer looks like
    ``{'action': 'CLICK', 'value': None, 'position': [0.28, 0.29]}``.
    Text that ``ast.literal_eval`` rejects is returned unchanged so it can
    still be printed; only the exceptions documented for ``literal_eval`` are
    caught, so e.g. ``KeyboardInterrupt`` is no longer swallowed.
    """
    try:
        return ast.literal_eval(output_text)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return output_text


def _click_position(result):
    """Return ``[x, y]`` if *result* is a CLICK action with a usable position, else ``None``.

    Guards against every shape the parsed output can take: a plain string
    (parsing failed), a non-dict literal, a dict without ``action`` /
    ``position``, or a position that is not a pair of numbers.
    """
    if not isinstance(result, dict):
        return None
    action = result.get('action')
    if not isinstance(action, str) or action.upper() != 'CLICK':
        return None
    position = result.get('position')
    if not isinstance(position, (list, tuple)) or len(position) < 2:
        return None
    x, y = position[0], position[1]
    if not all(isinstance(v, (int, float)) for v in (x, y)):
        return None
    return [x, y]


model.eval()
with torch.no_grad():
    for i in range(N_RUNS):
        # perf_counter is monotonic, which makes it the right clock for intervals.
        start_time = time.perf_counter()

        generated_ids = model.generate(**inputs, max_new_tokens=128)

        # Keep only the newly generated tokens by cutting off the prompt prefix.
        generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]

        # Convert the output string to a Python object when it is a literal.
        result = _parse_model_output(output_text)

        # The timed span covers generation, decoding and parsing, but not drawing.
        elapsed = time.perf_counter() - start_time
        times.append(elapsed)
        print(f"Run {i+1}/{N_RUNS} - Time: {elapsed:.4f} s, Output: {result}")

        # Only a well-formed CLICK action is drawn; anything else is skipped
        # instead of crashing the benchmark.
        click_at = _click_position(result)
        if click_at is not None:
            draw_point_on_image(args.image_path, click_at, output_path="./output_image.png")


# Mean time over the recorded runs. An empty list (no runs executed) yields
# NaN instead of raising ZeroDivisionError.
avg_time = sum(times) / len(times) if times else float("nan")
print(f"Average per inference time: {avg_time:.4f} seconds")

# Print CUDA memory usage after inference (peak since the last stats reset)
print(f"Max CUDA memory after inference: {torch.cuda.max_memory_allocated(device=DEVICE)/1024**2:.2f} MB")

# Print image size information. The context manager closes the file handle
# that Image.open would otherwise leave open, since the pixel data is never
# loaded here.
with Image.open(args.image_path) as _size_probe:
    print(f"Input image size: {_size_probe.size}")