Mubbashir Ahmed committed on
Commit 7e42f7f
1 Parent(s): dfdfb4b
Files changed (2)
  1. app.py +53 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,53 @@
+ import gradio as gr
+ from transformers import AutoProcessor, Llama4ForConditionalGeneration
+ import torch
+
+ model_id = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"
+
+ # Load the processor and model once at startup; device_map="auto" spreads the
+ # weights across whatever accelerators are available.
+ processor = AutoProcessor.from_pretrained(model_id)
+ model = Llama4ForConditionalGeneration.from_pretrained(
+     model_id,
+     attn_implementation="flex_attention",
+     device_map="auto",
+     torch_dtype=torch.bfloat16,
+ )
+
+ def analyze_images(image1, image2, question):
+     # Build a single user turn containing both images and the text question.
+     messages = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image", "image": image1},
+                 {"type": "image", "image": image2},
+                 {"type": "text", "text": question},
+             ],
+         }
+     ]
+
+     # Apply the chat template, tokenize text and images, and move the tensors
+     # to the model's device.
+     inputs = processor.apply_chat_template(
+         messages,
+         add_generation_prompt=True,
+         tokenize=True,
+         return_dict=True,
+         return_tensors="pt",
+     ).to(model.device)
+
+     outputs = model.generate(
+         **inputs,
+         max_new_tokens=256,
+     )
+
+     # Keep only the newly generated tokens (everything after the prompt) and
+     # decode them, dropping special tokens.
+     response = processor.batch_decode(
+         outputs[:, inputs["input_ids"].shape[-1]:],
+         skip_special_tokens=True,
+     )[0]
+     return response.strip()
+
+ gr.Interface(
+     fn=analyze_images,
+     inputs=[
+         gr.Image(type="pil", label="Image 1"),
+         gr.Image(type="pil", label="Image 2"),
+         gr.Textbox(lines=2, label="Your Question"),
+     ],
+     outputs="text",
+     title="LLaMA 4 Multimodal Visual Q&A",
+     description="Upload two images and ask a question — powered by LLaMA 4",
+ ).launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ transformers>=4.51.0
+ torch>=2.5.0
+ gradio>=4.24.0