import torch
import gradio as gr
from transformers import AutoProcessor, Llama4ForConditionalGeneration

model_id = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"

# Load the processor (handles chat templating and image preprocessing)
# and the model itself, sharded across available devices in bfloat16.
processor = AutoProcessor.from_pretrained(model_id)
model = Llama4ForConditionalGeneration.from_pretrained(
    model_id,
    attn_implementation="flex_attention",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

def analyze_images(image1, image2, question):
    # Build a single-turn chat message containing both images followed
    # by the user's question.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image1},
                {"type": "image", "image": image2},
                {"type": "text", "text": question},
            ],
        }
    ]

    # Apply the chat template and tokenize in one step, then move the
    # resulting tensors to the model's device.
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    outputs = model.generate(**inputs, max_new_tokens=256)

    # Slice off the prompt tokens so only the newly generated answer is
    # decoded; skip_special_tokens drops the end-of-turn marker.
    response = processor.batch_decode(
        outputs[:, inputs["input_ids"].shape[-1]:],
        skip_special_tokens=True,
    )[0]
    return response.strip()

gr.Interface(
    fn=analyze_images,
    inputs=[
        gr.Image(type="pil", label="Image 1"),
        gr.Image(type="pil", label="Image 2"),
        gr.Textbox(lines=2, label="Your Question"),
    ],
    outputs="text",
    title="LLaMA 4 Multimodal Visual Q&A",
    description="Upload two images and ask a question — powered by LLaMA 4",
).launch()
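
# A minimal way to run this demo (a sketch, assuming a CUDA machine with
# enough GPU memory for the Maverick checkpoint and access to the gated
# meta-llama repository on the Hugging Face Hub; the filename app.py is
# illustrative):
#
#   pip install torch transformers gradio accelerate
#   huggingface-cli login   # accept the Llama 4 license on the Hub first
#   python app.py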