import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import gradio as gr

# Set device to GPU if available, otherwise fall back to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the BLIP model and processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

# Generate a caption for a single image using BLIP
def generate_caption(image):
    inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        output_ids = model.generate(**inputs)
    return processor.decode(output_ids[0], skip_special_tokens=True)

# Open each uploaded file, caption it, and join the results for display
def process_images(image_files):
    captions = []
    for image_file in image_files:
        # gr.Files may yield plain file paths or objects with a .name attribute,
        # depending on the Gradio version, so handle both
        path = image_file if isinstance(image_file, str) else image_file.name
        image = Image.open(path).convert('RGB')
        captions.append(generate_caption(image))
    # A single Textbox output expects one string, so join the captions
    return "\n".join(captions)

# Set up the Gradio interface
iface = gr.Interface(
    fn=process_images,
    inputs=gr.Files(label="Upload Image Files"),
    outputs=gr.Textbox(label="Image Captions"),
    title="Image Captioning with BLIP",
    description="Upload images and generate captions using the BLIP model from Hugging Face.",
)

iface.launch(debug=True)
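
# Optional sketch: BLIP also supports prompt-conditioned captioning, where a
# text prefix steers the generated caption. This helper is not wired into the
# interface above; the prompt string is an illustrative assumption, while the
# text= kwarg on the processor and max_new_tokens on generate() are standard
# Transformers usage.
def generate_conditional_caption(image, prompt="a photography of"):
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=40)
    return processor.decode(output_ids[0], skip_special_tokens=True)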