# BobVLM Gradio demo (Hugging Face Space)
import gradio as gr
from BobVLM import BobVLMProcessor, load_model, pipeline
import torch

# Load model and processor once at import time.
# NOTE(review): this Space runs on CPU, so startup is slow — see the UI note below.
model = load_model()
processor = BobVLMProcessor()

# Create the inference pipeline that pairs the model with its processor.
pipe = pipeline(model, processor)
def analyze_image(image):
    """Process the image and return BobVLM's analysis.

    Args:
        image: The uploaded image (a PIL image, per the Gradio component's
            ``type="pil"`` setting).

    Returns:
        str: The first generated description from the pipeline, or a
        fallback message when the pipeline returns an empty/falsy response.
    """
    response = pipe(
        chat=[
            {"role": "system", "content": "You are an image understanding assistant. You can see and interpret images in fine detail. Provide clear, engaging descriptions that highlight the key elements and atmosphere of the image."},
            {"role": "user", "content": "Describe the image shortly"},
        ],
        images=image,
    )
    # The pipeline presumably returns a list of strings — take the first.
    return response[0] if response else "I couldn't analyze this image."
# Create the Gradio interface | |
with gr.Blocks(theme=gr.themes.Soft( | |
primary_hue="blue", | |
secondary_hue="indigo", | |
neutral_hue="slate", | |
)) as demo: | |
gr.Markdown( | |
""" | |
# π€ BobVLM Demo | |
This demo runs on cpu since I can't afford GPU prices here π€§. So it is quite slow so bare with me. Upload an image and let BobVLM describe what it sees | |
""" | |
) | |
with gr.Row(): | |
with gr.Column(scale=1): | |
input_image = gr.Image( | |
label="Upload Image", | |
type="pil", | |
height=400, | |
) | |
analyze_btn = gr.Button( | |
"π Analyze Image", | |
variant="primary", | |
size="lg", | |
) | |
with gr.Column(scale=1): | |
output_text = gr.Textbox( | |
label="BobVLM's Analysis", | |
placeholder="Analysis will appear here...", | |
lines=16, | |
show_copy_button=True, | |
) | |
# Add examples | |
gr.Examples( | |
examples=[ | |
["https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRmTRHBR1foifAUzxrQ5GOMyKgRX0iE7f9ivw&s"], | |
["https://i.guim.co.uk/img/media/1e0c3f8bbf09178377309c1f25ea326eaeb5aa0c/0_280_4200_2520/master/4200.jpg?width=1200&quality=85&auto=format&fit=max&s=858bf3e58ee96174b4b3d1499a324bc5"], | |
], | |
inputs=input_image, | |
outputs=output_text, | |
fn=analyze_image, | |
cache_examples=True, | |
) | |
# Set up the click event | |
analyze_btn.click( | |
fn=analyze_image, | |
inputs=input_image, | |
outputs=output_text, | |
) | |
gr.Markdown( | |
""" | |
### About BobVLM | |
BobVLM is a Vision Language Model that combines CLIP's visual understanding with LLaMA's language capabilities. | |
It was born out an experiment to train a small adapter layer to see how much it can learn given supervised finetuning (sft) data. The product is a model that can produce detailed and natural | |
image descriptions. | |
[View on GitHub](https://github.com/logic-OT/BobVLM) | [Hugging Face Model](https://huggingface.co/selfDotOsman/BobVLM-1.5b) | |
""" | |
) | |
# Launch the app only when executed as a script (not when imported).
if __name__ == "__main__":
    demo.launch()