import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import gradio as gr

# Set device to GPU if available, otherwise fall back to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the BLIP model and processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

# Generate a caption for a single image using BLIP
def generate_caption(image):
    inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        output_ids = model.generate(**inputs)
    return processor.decode(output_ids[0], skip_special_tokens=True)

# Open each uploaded file, caption it, and join the results for display
def process_images(image_files):
    captions = []
    for image_file in image_files:
        # gr.Files may yield plain file paths or objects with a .name attribute,
        # depending on the Gradio version, so handle both
        path = image_file if isinstance(image_file, str) else image_file.name
        image = Image.open(path).convert('RGB')
        captions.append(generate_caption(image))
    # A single Textbox output expects one string, so join the captions
    return "\n".join(captions)

# Set up the Gradio interface
iface = gr.Interface(
    fn=process_images,
    inputs=gr.Files(label="Upload Image Files"),
    outputs=gr.Textbox(label="Image Captions"),
    title="Image Captioning with BLIP",
    description="Upload images and generate captions using the BLIP model from Hugging Face.",
)

iface.launch(debug=True)
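
# Optional sketch: BLIP also supports prompt-conditioned captioning, where a
# text prefix steers the generated caption. This helper is not wired into the
# interface above; the prompt string is an illustrative assumption, while the
# text= kwarg on the processor and max_new_tokens on generate() are standard
# Transformers usage.
def generate_conditional_caption(image, prompt="a photography of"):
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=40)
    return processor.decode(output_ids[0], skip_special_tokens=True)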