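"""Gradio demo for visual question answering with SmolVLM-Instruct.

Upload an image and ask a question about it; the model generates a text answer.
"""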
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image
import numpy as np
import gradio as gr
# Set the device (GPU or CPU)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Initialize processor and model
try:
    processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
    model = AutoModelForVision2Seq.from_pretrained(
        "HuggingFaceTB/SmolVLM-Instruct",
        torch_dtype=torch.bfloat16,
        _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
    ).to(DEVICE)
except Exception as e:
    print(f"Error loading model or processor: {str(e)}")
    exit(1)
# Define the function to answer questions
def answer_question(image, question):
    # Check if the image is provided
    if image is None:
        return "Error: Please upload an image."
    # Convert NumPy array to PIL Image
    try:
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
    except Exception as e:
        return f"Error: Unable to process the image. {str(e)}"
    # Ensure question is provided
    if not question.strip():
        return "Error: Please provide a question."
    # Create input message for the model
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": question},
            ],
        },
    ]
    # Apply chat template and prepare inputs
    try:
        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = processor(text=prompt, images=[image], return_tensors="pt").to(DEVICE)
    except Exception as e:
        return f"Error: Failed to prepare inputs. {str(e)}"
    # Generate the answer, decoding only the newly generated tokens
    # so the prompt is not echoed back in the response
    try:
        outputs = model.generate(**inputs, max_new_tokens=400)
        generated_tokens = outputs[0][inputs["input_ids"].shape[1]:]
        answer = processor.decode(generated_tokens, skip_special_tokens=True)
        return answer.strip()
    except Exception as e:
        return f"Error: Failed to generate answer. {str(e)}"
# Create Gradio interface
iface = gr.Interface(
    fn=answer_question,
    inputs=[
        gr.Image(type="numpy"),
        gr.Textbox(lines=2, placeholder="Enter your question here..."),
    ],
    outputs="text",
    title="FAAM-demo | Vision Language Model | SmolVLM",
    description="Upload an image and ask a question about it.",
)
if __name__ == "__main__":
    iface.launch()
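# Usage note (a minimal sketch; the file name and package list are assumptions, not pinned here):
#   pip install torch transformers gradio pillow numpy
#   pip install flash-attn   # only needed on CUDA when using flash_attention_2
#   python app.py
# Gradio serves the UI at http://127.0.0.1:7860 by default.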