"""Gradio demo that captions an uploaded image with the BLIP large model."""

from io import BytesIO

import gradio as gr
import requests
import torch
from PIL import Image
from transformers import BlipForConditionalGeneration, BlipProcessor

# Load model and processor once at start-up so every request reuses them.
model_name = "Salesforce/blip-image-captioning-large"
processor = BlipProcessor.from_pretrained(model_name)
model = BlipForConditionalGeneration.from_pretrained(model_name)

# Run on the GPU when one is available; otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Explicit upper bound on caption length instead of relying on the
# library's implicit default generation length.
MAX_NEW_TOKENS = 50


def generate_caption(image) -> str:
    """Return a text caption for *image* produced by the BLIP model.

    Args:
        image: The picture to caption, as delivered by the Gradio
            ``"image"`` input or passed directly as a ``PIL.Image.Image``.
            ``None`` means no image was supplied.

    Returns:
        The decoded caption with special tokens stripped.

    Raises:
        gr.Error: If ``image`` is ``None``.
    """
    if image is None:
        # Surface a readable message in the UI rather than letting the
        # processor fail on a missing input.
        raise gr.Error("Please provide an image to caption.")

    if isinstance(image, Image.Image) and image.mode != "RGB":
        # Normalise RGBA / palette / grayscale PIL images to three channels.
        image = image.convert("RGB")

    # Preprocess the image and move the tensors to the model's device.
    inputs = processor(image, return_tensors="pt").to(device)

    # Generate the caption without tracking gradients (inference only).
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

    # Decode the first (only) generated sequence into plain text.
    return processor.decode(outputs[0], skip_special_tokens=True)


# Create a Gradio interface: one image in, one text caption out.
iface = gr.Interface(fn=generate_caption, inputs="image", outputs="text")

if __name__ == "__main__":
    # `share=True` to get a public link
    iface.launch(share=True)