from transformers import Blip2Processor, Blip2ForConditionalGeneration
from gradio import Interface
from PIL import Image

# Load the BLIP-2 model and processor.
# Note: BLIP-2 checkpoints such as "Salesforce/blip2-flan-t5-xl" require the
# Blip2Processor / Blip2ForConditionalGeneration classes, not the original BLIP ones.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl")


def generate_response(image, prompt):
    """Generate a response from the model based on the image and prompt."""
    inputs = processor(images=image, text=prompt, return_tensors="pt")
    outputs = model.generate(**inputs)
    return processor.decode(outputs[0], skip_special_tokens=True)


# Create a Gradio interface
def predict(image, prompt):
    return generate_response(image, prompt)


interface = Interface(
    fn=predict,
    inputs=["image", "text"],
    outputs="text",
    title="BLIP-2: Introspective Monologue Generator",
    description="Upload an image and provide a prompt. The model will respond with introspective thoughts about the image.",
)

if __name__ == "__main__":
    interface.launch()