import gradio as gr
from transformers import AutoProcessor, Llama4ForConditionalGeneration
import torch
model_id = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"
processor = AutoProcessor.from_pretrained(model_id)
model = Llama4ForConditionalGeneration.from_pretrained(
model_id,
attn_implementation="flex_attention",
device_map="auto",
torch_dtype=torch.bfloat16,
)


def analyze_images(image1, image2, question):
    """Answer a free-form question about two uploaded images."""
    # Build a single-turn chat message that passes both images alongside the question.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image1},
                {"type": "image", "image": image2},
                {"type": "text", "text": question},
            ],
        }
    ]
    # Apply the chat template, tokenize text and images, and move tensors to the model's device.
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)
    # Generate the model's answer.
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
    )
    # Decode only the newly generated tokens, skipping the prompt portion.
    response = processor.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:])[0]
    return response.strip()


# Gradio UI: two image inputs and a question box, returning the model's text answer.
gr.Interface(
    fn=analyze_images,
    inputs=[
        gr.Image(type="pil", label="Image 1"),
        gr.Image(type="pil", label="Image 2"),
        gr.Textbox(lines=2, label="Your Question"),
    ],
    outputs="text",
    title="Llama 4 Multimodal Visual Q&A",
    description="Upload two images and ask a question about them, powered by Llama 4 Maverick.",
).launch()