# kosmos-2 / app.py
# Tonic's picture
# Update app.py
# e284ea0 verified
import torch
import requests
from io import BytesIO
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
import gradio as gr
# Process-wide cache so the checkpoint is only loaded once, no matter how
# many times load_models() is called (it is invoked on every request).
_LOADED_MODELS = {}


def load_models():
    """Return the Kosmos-2 model and its processor, loading them on first use.

    The first call instantiates the "microsoft/kosmos-2-patch14-224"
    checkpoint, moves the model to CUDA when ``torch.cuda.is_available()``
    (CPU otherwise), and stores the pair in a module-level cache. Later
    calls return the very same objects instead of reloading them.

    Returns:
        tuple: ``(model, processor)``.
    """
    if "pair" not in _LOADED_MODELS:
        checkpoint = "microsoft/kosmos-2-patch14-224"
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = AutoModelForVision2Seq.from_pretrained(checkpoint).to(device)
        processor = AutoProcessor.from_pretrained(checkpoint)
        _LOADED_MODELS["pair"] = (model, processor)
    return _LOADED_MODELS["pair"]
def generate_description(image):
    """Generate a caption for ``image`` with the Kosmos-2 model.

    Args:
        image: The image to caption, passed straight to the processor's
            ``images`` argument. Presumably whatever the Gradio "image"
            input component delivers (confirm against the Gradio version
            in use).

    Returns:
        str: The generated caption, with the model's grounding markup
        removed and surrounding whitespace stripped.

    Raises:
        ValueError: If ``image`` is ``None`` (nothing was uploaded).
    """
    # Fail fast, before any model loading, when no image was supplied.
    if image is None:
        raise ValueError("An image is required to generate a description.")

    model, processor = load_models()
    prompt = "<grounding>An image of"

    # A single short prompt needs neither padding nor truncation; padding to
    # 'max_length' would append pad tokens that generation then continues
    # from.
    inputs = processor(text=prompt, images=image, return_tensors="pt")
    # Move tensors to the device the model lives on.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    raw_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # The raw decoding still contains grounding tags (<phrase>, <object>,
    # <patch_index_...>); post_process_generation returns the cleaned caption
    # together with the extracted entities, which are not needed here.
    caption, _entities = processor.post_process_generation(raw_text)
    return caption.strip()
if __name__ == "__main__":
    # Build the Gradio UI: one image input, one text output, wired to
    # generate_description, with a single bundled example image.
    interface = gr.Interface(
        fn=generate_description,
        inputs=["image"],
        outputs="text",
        title="GPT-based Visual Storytelling",
        description="Upload an image to get a detailed caption generated by our powerful AI!",
        examples=[["PRO-b0fe1914d67344d98e120a19cd1aadf1.jpg"]],
    )
    # Start the web server for the interface.
    interface.launch()