from transformers import Blip2Processor, Blip2ForConditionalGeneration
from gradio import Interface
from PIL import Image

# Load the BLIP-2 model and processor.
# Note: BLIP-2 checkpoints such as "Salesforce/blip2-flan-t5-xl" require the
# Blip2Processor / Blip2ForConditionalGeneration classes, not the original BLIP ones.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl")


def generate_response(image, prompt):
    """Generate a response from the model based on the image and prompt."""
    inputs = processor(images=image, text=prompt, return_tensors="pt")
    outputs = model.generate(**inputs)
    return processor.decode(outputs[0], skip_special_tokens=True)


# Create a Gradio interface
def predict(image, prompt):
    return generate_response(image, prompt)


interface = Interface(
    fn=predict,
    inputs=["image", "text"],
    outputs="text",
    title="BLIP-2: Introspective Monologue Generator",
    description="Upload an image and provide a prompt. The model will respond with introspective thoughts about the image.",
)

if __name__ == "__main__":
    interface.launch()