import gradio as gr
import torch
from transformers import AutoProcessor, Llama4ForConditionalGeneration

model_id = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"

# Load the processor and model once at startup; device_map="auto" places the
# weights across available GPUs and bfloat16 keeps the memory footprint down.
processor = AutoProcessor.from_pretrained(model_id)
model = Llama4ForConditionalGeneration.from_pretrained(
    model_id,
    attn_implementation="flex_attention",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)


def analyze_images(image1, image2, question):
    # Build a single-turn chat message containing both images and the question.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image1},
                {"type": "image", "image": image2},
                {"type": "text", "text": question},
            ],
        }
    ]

    # Apply the chat template, tokenize, and move the tensors to the model's device.
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
    )

    # Decode only the newly generated tokens, dropping the prompt and
    # any special tokens from the reply.
    response = processor.batch_decode(
        outputs[:, inputs["input_ids"].shape[-1]:],
        skip_special_tokens=True,
    )[0]
    return response.strip()


gr.Interface(
    fn=analyze_images,
    inputs=[
        gr.Image(type="pil", label="Image 1"),
        gr.Image(type="pil", label="Image 2"),
        gr.Textbox(lines=2, label="Your Question"),
    ],
    outputs="text",
    title="Llama 4 Multimodal Visual Q&A",
    description="Upload two images and ask a question, powered by Llama 4",
).launch()