# import gradio as gr

# def greet(name):
#     return "Hello " + name + "!!"

# demo = gr.Interface(fn=greet, inputs="text", outputs="text")
# demo.launch()
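
# Gradio demo: multimodal text/image analysis with Llama 3.2 Vision.
# Note: the meta-llama checkpoints are gated on the Hugging Face Hub, so this
# script assumes you have accepted the model license and authenticated
# (for example via `huggingface-cli login`) before the weights can download.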


import gradio as gr
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor

# Load the Llama 3.2 Vision model and processor once and cache them,
# so each request doesn't reload several gigabytes of weights
_model = None
_processor = None

def load_llama_model():
    global _model, _processor
    if _model is None:
        model_id = "meta-llama/Llama-3.2-11B-Vision"

        # Load model and processor (bfloat16 halves memory use; device_map="auto"
        # places the weights on the available GPU(s))
        _model = MllamaForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            device_map="auto",
        )
        _processor = AutoProcessor.from_pretrained(model_id)

    return _model, _processor

# Generate a prediction for the text prompt, optionally conditioned on an image
def process_input(text, image: Image.Image = None):
    model, processor = load_llama_model()

    if image is not None:
        # Gradio hands us a PIL Image; normalize the mode and let the processor
        # handle resizing and tiling (downscaling by hand would discard detail)
        vision_input = image.convert("RGB")

        prompt = f"<|image|><|begin_of_text|>{text}"

        # Process image and text together
        inputs = processor(vision_input, prompt, return_tensors="pt").to(model.device)
    else:
        # Text only: pass the prompt as a keyword argument, since the processor's
        # first positional parameter is `images`
        prompt = f"<|begin_of_text|>{text}"
        inputs = processor(text=prompt, return_tensors="pt").to(model.device)

    # Generate output from the model
    outputs = model.generate(**inputs, max_new_tokens=100)

    # Decode the output into readable text
    decoded_output = processor.decode(outputs[0], skip_special_tokens=True)

    return decoded_output
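
# Example call outside the UI (hypothetical file path, for illustration only):
#   from PIL import Image
#   print(process_input("Describe this image.", Image.open("example.jpg")))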

# Gradio Interface Setup
def demo():
    # Define Gradio input and output components
    text_input = gr.Textbox(label="Text Input", placeholder="Enter text here", lines=5)

    # Use type="pil" to work with PIL Image objects
    image_input = gr.Image(label="Upload an Image", type="pil")

    output = gr.Textbox(label="Model Output", lines=5)

    # Define the interface layout
    interface = gr.Interface(
        fn=process_input,
        inputs=[text_input, image_input],
        outputs=output,
        title="Llama 3.2 Multimodal Text-Image Analyzer",
        description="Upload an image and/or provide text for analysis using the Llama 3.2 Vision Model."
    )

    # Launch the demo
    interface.launch()

# Run the demo
if __name__ == "__main__":
    demo()
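
# To run: `python app.py` (assuming this file is saved as app.py). Gradio
# serves the UI at http://127.0.0.1:7860 by default. Running the 11B model in
# bfloat16 needs roughly 22 GB of VRAM for the weights alone.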