import gradio as gr
import requests
import torch
from PIL import Image
import spaces
from transformers import MllamaForConditionalGeneration, AutoProcessor
import os
from huggingface_hub import login

# Authenticate with the Hugging Face Hub; the gated Llama 3.2 checkpoint
# can only be downloaded with a valid access token.
huggingface_token = os.getenv("SECRET_ENV_VARIABLE")
if huggingface_token:
    login(huggingface_token)

# Load the Llama 3.2 Vision Model
def load_llama_model():
    model_id = "meta-llama/Llama-3.2-11B-Vision"

    # Load model and processor
    model = MllamaForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        offload_folder="offload", 
    )
    model.tie_weights() 
    processor = AutoProcessor.from_pretrained(model_id)

    return model, processor
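
# The 11B checkpoint takes a long time to load, so cache it at module scope
# rather than reloading it on every request. (This caching helper is an
# illustrative addition; the names _model_cache and get_llama_model are not
# from the original Space, which reloaded the model on each call.)
_model_cache = None

def get_llama_model():
    global _model_cache
    if _model_cache is None:
        _model_cache = load_llama_model()
    return _model_cache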

# Function to generate predictions for text and image
@spaces.GPU
def process_input(text, image=None):
    model, processor = get_llama_model()

    if image is not None:
        # Ensure a 3-channel PIL image; the Mllama processor handles
        # resizing and tiling itself, so no manual resize is needed
        vision_input = image.convert("RGB")

        # Prompt format for the base Llama 3.2 Vision model: <|image|>
        # marks where the model attends to the image
        prompt = f"<|image|><|begin_of_text|>{text}"

        # Process image and text together
        inputs = processor(images=vision_input, text=prompt, return_tensors="pt").to(model.device)
    else:
        # Text-only path: pass the prompt via the text keyword argument,
        # since the processor's first positional parameter is images
        prompt = f"<|begin_of_text|>{text}"
        inputs = processor(text=prompt, return_tensors="pt").to(model.device)

    # Generate output from the model
    outputs = model.generate(**inputs, max_new_tokens=50)

    # Decode the output to return a readable text
    decoded_output = processor.decode(outputs[0], skip_special_tokens=True)

    return decoded_output

def demo():
    # Define Gradio input and output components
    text_input = gr.Textbox(label="Text Input", placeholder="Enter text here", lines=5)
    image_input = gr.Image(label="Upload an Image", type="pil")
    output = gr.Textbox(label="Model Output", lines=3)

    # Add two examples for multimodal analysis
    examples = [
        ["The llama is ", "./examples/llama.png"],
        ["The cute hampster is wearing ", "./examples/hampster.png"]
    ]

    # Define the interface layout
    interface = gr.Interface(
        fn=process_input,
        inputs=[text_input, image_input],
        outputs=output,
        examples=examples,
        title="Llama 3.2 Multimodal Text-Image Analyzer",
        description="Upload an image and/or provide text for analysis using the Llama 3.2 Vision Model. You can also try out the provided examples.",
    )

    # Launch the demo
    interface.launch()

# Run the demo
if __name__ == "__main__":
    demo()
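
# Assumed local setup (not part of the original Space):
#   pip install gradio spaces torch transformers accelerate huggingface_hub pillow
#   export SECRET_ENV_VARIABLE=<hf-access-token-with-access-to-the-gated-model>
#   python app.py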