import gradio as gr
from BobVLM import BobVLMProcessor, load_model, pipeline
import torch
# Load model and processor
model = load_model()
processor = BobVLMProcessor()
# Create pipeline
pipe = pipeline(model, processor)
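# Note: per the demo description below, this Space runs inference on CPU, so the
# model and processor are loaded once at startup and each request can take a while.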
def analyze_image(image):
"""Process the image and return BobVLM's analysis."""
response = pipe(
chat=[
{"role": "system", "content": "You are an image understanding assistant. You can see and interpret images in fine detail. Provide clear, engaging descriptions that highlight the key elements and atmosphere of the image."},
{"role": "user", "content": "Describe the image shortly"},
],
images=image
)
return response[0] if response else "I couldn't analyze this image."
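# A minimal sketch of exercising the pipeline outside the Gradio UI, mirroring the
# chat/images call shape used in analyze_image above. "example.jpg" is a placeholder
# path, not a file shipped with this demo; uncomment to try it locally.
#
# from PIL import Image
# print(analyze_image(Image.open("example.jpg")))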
# Create the Gradio interface
with gr.Blocks(theme=gr.themes.Soft(
primary_hue="blue",
secondary_hue="indigo",
neutral_hue="slate",
)) as demo:
gr.Markdown(
"""
# πŸ€– BobVLM Demo
This demo runs on CPU since I can't afford GPU prices here 🀧, so it is quite slow; please bear with me. Upload an image and let BobVLM describe what it sees.
"""
)
with gr.Row():
with gr.Column(scale=1):
input_image = gr.Image(
label="Upload Image",
type="pil",
height=400,
)
analyze_btn = gr.Button(
"πŸ” Analyze Image",
variant="primary",
size="lg",
)
with gr.Column(scale=1):
output_text = gr.Textbox(
label="BobVLM's Analysis",
placeholder="Analysis will appear here...",
lines=16,
show_copy_button=True,
)
# Add examples
gr.Examples(
examples=[
["https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRmTRHBR1foifAUzxrQ5GOMyKgRX0iE7f9ivw&s"],
["https://i.guim.co.uk/img/media/1e0c3f8bbf09178377309c1f25ea326eaeb5aa0c/0_280_4200_2520/master/4200.jpg?width=1200&quality=85&auto=format&fit=max&s=858bf3e58ee96174b4b3d1499a324bc5"],
],
inputs=input_image,
outputs=output_text,
fn=analyze_image,
cache_examples=True,
)
# Set up the click event
analyze_btn.click(
fn=analyze_image,
inputs=input_image,
outputs=output_text,
)
gr.Markdown(
"""
### About BobVLM
BobVLM is a Vision Language Model that combines CLIP's visual understanding with LLaMA's language capabilities.
It was born out of an experiment to train a small adapter layer and see how much it could learn from supervised fine-tuning (SFT) data. The result is a model that produces detailed, natural
image descriptions.
[View on GitHub](https://github.com/logic-OT/BobVLM) | [Hugging Face Model](https://huggingface.co/selfDotOsman/BobVLM-1.5b)
"""
)
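# The "About" text above describes the general recipe: CLIP for vision, LLaMA for
# language, and a small adapter trained with SFT to bridge them. The class below is
# a purely hypothetical sketch of what such an adapter could look like (an MLP that
# projects CLIP image features into the language model's embedding space). It is NOT
# BobVLM's actual implementation and is not used anywhere in this app.
class _AdapterSketch(torch.nn.Module):
    def __init__(self, clip_dim: int = 768, llm_dim: int = 2048):
        # Dimensions are illustrative placeholders, not BobVLM's real sizes.
        super().__init__()
        self.proj = torch.nn.Sequential(
            torch.nn.Linear(clip_dim, llm_dim),
            torch.nn.GELU(),
            torch.nn.Linear(llm_dim, llm_dim),
        )

    def forward(self, clip_features: torch.Tensor) -> torch.Tensor:
        # Map per-patch CLIP features to pseudo-token embeddings the LLM can attend to.
        return self.proj(clip_features)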
# Launch the app
if __name__ == "__main__":
demo.launch()