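"""Gradio demo: visual question answering with HuggingFaceTB/SmolVLM-Instruct.

Upload an image and ask a question about it; the model returns a text answer.
"""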
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
import numpy as np
import gradio as gr

# Set the device (GPU or CPU)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize processor and model
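# (bfloat16 weights; flash_attention_2 needs a supported GPU and the flash-attn package,
# otherwise the standard "eager" attention implementation is used)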
try:
    processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
    model = AutoModelForVision2Seq.from_pretrained(
        "HuggingFaceTB/SmolVLM-Instruct",
        torch_dtype=torch.bfloat16,
        _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
    ).to(DEVICE)
except Exception as e:
    print(f"Error loading model or processor: {str(e)}")
    exit(1)


# Define the function to answer questions
def answer_question(image, question):
    # Check if the image is provided
    if image is None:
        return "Error: Please upload an image."

    # Convert NumPy array to PIL Image and normalize to RGB
    try:
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        image = image.convert("RGB")  # handles grayscale/RGBA uploads
    except Exception as e:
        return f"Error: Unable to process the image. {str(e)}"

    # Ensure question is provided
    if not question or not question.strip():
        return "Error: Please provide a question."

    # Create input message for the model
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": question},
            ],
        },
    ]

    # Apply chat template and prepare inputs
    try:
        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = processor(text=prompt, images=[image], return_tensors="pt").to(DEVICE)
    except Exception as e:
        return f"Error: Failed to prepare inputs. {str(e)}"

    # Generate the answer, decoding only the newly generated tokens (not the echoed prompt)
    try:
        outputs = model.generate(**inputs, max_new_tokens=400)
        generated_tokens = outputs[0][inputs["input_ids"].shape[1]:]
        answer = processor.decode(generated_tokens, skip_special_tokens=True)
        return answer.strip()
    except Exception as e:
        return f"Error: Failed to generate answer. {str(e)}"


# Create Gradio interface
iface = gr.Interface(
    fn=answer_question,
    inputs=[
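        # type="numpy" passes the upload as a NumPy array; answer_question converts it to PIL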
        gr.Image(type="numpy"),
        gr.Textbox(lines=2, placeholder="Enter your question here..."),
    ],
    outputs="text",
    title="FAAM-demo | Vision Language Model | SmolVLM",
    description="Upload an image and ask a question about it.",
)

if __name__ == "__main__":
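    # Launch the local Gradio server; add share=True for a temporary public link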
    iface.launch()