"""Gradio demo that captions an uploaded or URL-fetched image with BLIP."""

import tempfile
from io import BytesIO
from pathlib import Path
from typing import Optional, Tuple

import gradio as gr
import numpy as np
import requests
from PIL import Image
from transformers import AutoProcessor, BlipForConditionalGeneration

MODEL_NAME = "Salesforce/blip-image-captioning-base"

# Upper bound (seconds) on connecting to / reading from a remote image host,
# so an unresponsive server cannot hang a request handler forever.
REQUEST_TIMEOUT_SECONDS = 10

# Load the pretrained processor and model once, at import time.
processor = AutoProcessor.from_pretrained(MODEL_NAME)
model = BlipForConditionalGeneration.from_pretrained(MODEL_NAME)


def fetch_image(url: str) -> np.ndarray:
    """Fetch an image from a URL and return it as an RGB numpy array.

    Args:
        url: Address of the image to download.

    Returns:
        The decoded image, converted to RGB, as a numpy array.

    Raises:
        ValueError: If the download fails, the server answers with an error
            status, or the payload cannot be opened as an image. The
            underlying exception is attached as ``__cause__``.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
        # Use the decoded body rather than ``response.raw``: the raw stream is
        # not content-decoded, so gzip/deflate responses would reach PIL still
        # compressed. Reading ``content`` also releases the connection.
        image = Image.open(BytesIO(response.content)).convert("RGB")
        return np.array(image)
    except Exception as e:
        raise ValueError(f"Failed to fetch image: {e}") from e


def caption_image(
    input_image: Optional[np.ndarray] = None,
    image_url: Optional[str] = None,
) -> Tuple[str, Optional[str]]:
    """Generate a caption for an uploaded image or one fetched from a URL.

    A non-blank ``image_url`` takes precedence over ``input_image``.

    Args:
        input_image: Image array supplied by the upload component, or None.
        image_url: URL of an image to download, or None/blank to use
            ``input_image`` instead.

    Returns:
        ``(caption, path)`` where ``path`` points to a ``caption.txt`` file
        holding the caption. On any failure returns
        ``("Error: <message>", None)`` instead of raising, so the message is
        shown in the caption textbox.
    """
    try:
        # A whitespace-only textbox value means "no URL given".
        image_url = (image_url or "").strip()
        if image_url:
            image_array = fetch_image(image_url)
        elif input_image is not None:
            image_array = input_image
        else:
            raise ValueError("Please provide either an image or an image URL.")

        # Ensure the image is in RGB format
        pil_image = Image.fromarray(image_array).convert("RGB")

        # Process the image and generate caption
        inputs = processor(pil_image, return_tensors="pt")
        out = model.generate(**inputs, max_length=50)
        caption = processor.decode(out[0], skip_special_tokens=True)

        # Save the caption as a downloadable .txt file. Each call gets its own
        # temporary directory so concurrent requests cannot overwrite one
        # another's file, while the download keeps the name "caption.txt".
        caption_path = Path(tempfile.mkdtemp(prefix="caption_")) / "caption.txt"
        caption_path.write_text(caption, encoding="utf-8")

        return caption, str(caption_path)
    except Exception as e:
        # UI boundary: report the problem in the caption box instead of raising.
        return f"Error: {e}", None


iface = gr.Interface(
    fn=caption_image,
    inputs=[
        gr.Image(type="numpy", label="Upload Image"),
        gr.Textbox(
            label="Image URL (Optional)",
            placeholder="Enter an image URL here",
        ),
    ],
    outputs=[
        gr.Textbox(label="Generated Caption"),
        gr.File(label="Download Caption"),
    ],
    examples=[
        ["model.jpg"],
        ["horse.jpeg"],
        ["panda.jpg"],
    ],
    title="Advanced Image Captioning with the BLIP model",
    description=(
        "Upload an image or provide a URL to an image to generate a caption. "
        "You can also drag and drop the example images. "
        "Download the generated caption as a .txt file if needed."
    ),
    live=True,
    theme="compact",
)

iface.launch()