import sys

import torch
import numpy as np
import gradio as gr
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq

# Set the device (GPU or CPU)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize processor and model
try:
    processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
    model = AutoModelForVision2Seq.from_pretrained(
        "HuggingFaceTB/SmolVLM-Instruct",
        torch_dtype=torch.bfloat16,
        _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
    ).to(DEVICE)
except Exception as e:
    print(f"Error loading model or processor: {str(e)}")
    sys.exit(1)


# Define the function to answer questions
def answer_question(image, question):
    # Check if the image is provided
    if image is None:
        return "Error: Please upload an image."

    # Convert NumPy array to PIL Image if necessary
    try:
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
    except Exception as e:
        return f"Error: Unable to process the image. {str(e)}"

    # Ensure a question is provided
    if not question or not question.strip():
        return "Error: Please provide a question."

    # Create input message for the model
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": question},
            ],
        },
    ]

    # Build the prompt with the chat template and prepare the model inputs
    try:
        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = processor(text=prompt, images=[image], return_tensors="pt").to(DEVICE)
    except Exception as e:
        return f"Error: Failed to prepare inputs. {str(e)}"

    # Generate the answer and decode only the newly generated tokens
    try:
        outputs = model.generate(**inputs, max_new_tokens=500)
        generated_tokens = outputs[0][inputs["input_ids"].shape[1]:]
        answer = processor.decode(generated_tokens, skip_special_tokens=True)
        return answer.strip()
    except Exception as e:
        return f"Error: Failed to generate answer. {str(e)}"


# Create Gradio interface
iface = gr.Interface(
    fn=answer_question,
    inputs=[
        gr.Image(type="numpy"),
        gr.Textbox(lines=2, placeholder="Enter your question here..."),
    ],
    outputs="text",
    title="Image Question Answering",
    description="Upload an image and ask a question about it.",
)

if __name__ == "__main__":
    iface.launch()
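
# Optional smoke test without launching the UI: call answer_question directly.
# This is a minimal sketch; "example.jpg" is a placeholder path, not a file
# bundled with this app.
#
#   img = Image.open("example.jpg")
#   print(answer_question(np.array(img), "What is shown in this image?"))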