import gradio as gr
from BobVLM import BobVLMProcessor, load_model, pipeline

# Load model and processor
model = load_model()
processor = BobVLMProcessor()

# Create pipeline
pipe = pipeline(model, processor)


def analyze_image(image):
    """Process the image and return BobVLM's analysis."""
    response = pipe(
        chat=[
            {
                "role": "system",
                "content": (
                    "You are an image understanding assistant. You can see and "
                    "interpret images in fine detail. Provide clear, engaging "
                    "descriptions that highlight the key elements and atmosphere "
                    "of the image."
                ),
            },
            {"role": "user", "content": "Describe the image briefly"},
        ],
        images=image,
    )
    return response[0] if response else "I couldn't analyze this image."


# Create the Gradio interface
with gr.Blocks(theme=gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="indigo",
    neutral_hue="slate",
)) as demo:
    gr.Markdown(
        """
        # 🤖 BobVLM Demo

        This demo runs on CPU since I can't afford GPU prices here 🤧,
        so it is quite slow; bear with me.
        Upload an image and let BobVLM describe what it sees.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            input_image = gr.Image(
                label="Upload Image",
                type="pil",
                height=400,
            )
            analyze_btn = gr.Button(
                "🔍 Analyze Image",
                variant="primary",
                size="lg",
            )

        with gr.Column(scale=1):
            output_text = gr.Textbox(
                label="BobVLM's Analysis",
                placeholder="Analysis will appear here...",
                lines=16,
                show_copy_button=True,
            )

    # Add examples
    gr.Examples(
        examples=[
            ["https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRmTRHBR1foifAUzxrQ5GOMyKgRX0iE7f9ivw&s"],
            ["https://i.guim.co.uk/img/media/1e0c3f8bbf09178377309c1f25ea326eaeb5aa0c/0_280_4200_2520/master/4200.jpg?width=1200&quality=85&auto=format&fit=max&s=858bf3e58ee96174b4b3d1499a324bc5"],
        ],
        inputs=input_image,
        outputs=output_text,
        fn=analyze_image,
        cache_examples=True,
    )

    # Set up the click event
    analyze_btn.click(
        fn=analyze_image,
        inputs=input_image,
        outputs=output_text,
    )

    gr.Markdown(
        """
        ### About BobVLM

        BobVLM is a Vision Language Model that combines CLIP's visual understanding
        with LLaMA's language capabilities. It was born out of an experiment to train
        a small adapter layer and see how much it could learn from supervised
        fine-tuning (SFT) data. The result is a model that produces detailed,
        natural image descriptions.

        [View on GitHub](https://github.com/yourusername/BobVLM) | [Hugging Face Model](https://huggingface.co/selfDotOsman/BobVLM-1.5b)
        """
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()
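
# --- Optional: request queueing (a sketch, not part of the original app) ---
# Since inference runs on CPU, each analysis can take a while, and concurrent
# visitors would otherwise block one another. Gradio's built-in queue
# serializes requests and shows each user their position in line. If that
# behavior is wanted, the launch above could be swapped for something like the
# following (max_size=8 is an illustrative choice, not a value from the
# original script):
#
#     if __name__ == "__main__":
#         demo.queue(max_size=8).launch()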