import sys

import torch
import numpy as np
import gradio as gr
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq

# Set the device (GPU or CPU)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize processor and model
try:
    processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
    model = AutoModelForVision2Seq.from_pretrained(
        "HuggingFaceTB/SmolVLM-Instruct",
        torch_dtype=torch.bfloat16,
        _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
    ).to(DEVICE)
except Exception as e:
    print(f"Error loading model or processor: {str(e)}")
    sys.exit(1)


# Define the function to answer questions
def answer_question(image, question):
    # Check if the image is provided
    if image is None:
        return "Error: Please upload an image."

    # Convert NumPy array to PIL Image if necessary
    try:
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
    except Exception as e:
        return f"Error: Unable to process the image. {str(e)}"

    # Ensure a question is provided
    if not question or not question.strip():
        return "Error: Please provide a question."

    # Create input message for the model
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": question},
            ],
        },
    ]

    # Build the prompt with the chat template and prepare the model inputs
    try:
        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = processor(text=prompt, images=[image], return_tensors="pt").to(DEVICE)
    except Exception as e:
        return f"Error: Failed to prepare inputs. {str(e)}"

    # Generate the answer and decode only the newly generated tokens
    try:
        outputs = model.generate(**inputs, max_new_tokens=500)
        generated_tokens = outputs[0][inputs["input_ids"].shape[1]:]
        answer = processor.decode(generated_tokens, skip_special_tokens=True)
        return answer.strip()
    except Exception as e:
        return f"Error: Failed to generate answer. {str(e)}"


# Create Gradio interface
iface = gr.Interface(
    fn=answer_question,
    inputs=[
        gr.Image(type="numpy"),
        gr.Textbox(lines=2, placeholder="Enter your question here..."),
    ],
    outputs="text",
    title="Image Question Answering",
    description="Upload an image and ask a question about it.",
)

if __name__ == "__main__":
    iface.launch()
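
# Optional smoke test without launching the UI: call answer_question directly.
# This is a minimal sketch; "example.jpg" is a placeholder path, not a file
# bundled with this app.
#
#   img = Image.open("example.jpg")
#   print(answer_question(np.array(img), "What is shown in this image?"))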