Spaces:
Running
Running
File size: 3,040 Bytes
3a40bac a458187 3a40bac a458187 3a40bac ce9221c 3a40bac a458187 3a40bac d9ff7a1 3a40bac d52e4b1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
import gradio as gr
from BobVLM import BobVLMProcessor, load_model, pipeline
import torch
# Load the BobVLM model and its processor once at import time so every
# request reuses the same instances (loading is slow, especially on CPU).
# NOTE(review): load_model() presumably downloads weights on first run — confirm.
model = load_model()
processor = BobVLMProcessor()
# Bundle model + processor into a single callable inference pipeline.
pipe = pipeline(model, processor)
def analyze_image(image):
    """Run the uploaded image through the BobVLM pipeline.

    Args:
        image: PIL image supplied by the Gradio ``gr.Image`` component.

    Returns:
        The first response string from the pipeline, or a fallback
        message when the pipeline returns nothing.
    """
    messages = [
        {"role": "system", "content": "You are an image understanding assistant. You can see and interpret images in fine detail. Provide clear, engaging descriptions that highlight the key elements and atmosphere of the image."},
        {"role": "user", "content": "Describe the image shortly"},
    ]
    result = pipe(chat=messages, images=image)
    if not result:
        return "I couldn't analyze this image."
    return result[0]
# ---------------------------------------------------------------------------
# Gradio interface: image upload + button on the left, analysis on the right.
# NOTE(review): the emoji in the UI strings were mojibake in the source
# ("π€", "π€§", "π"); restored to the likely originals — confirm.
# ---------------------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="indigo",
    neutral_hue="slate",
)) as demo:
    gr.Markdown(
        """
        # 🤖 BobVLM Demo
        This demo runs on cpu since I can't afford GPU prices here 🤧. So it is quite slow so bear with me. Upload an image and let BobVLM describe what it sees
        """
    )

    with gr.Row():
        # Left column: input image and the trigger button.
        with gr.Column(scale=1):
            input_image = gr.Image(
                label="Upload Image",
                type="pil",  # deliver the image to analyze_image as a PIL.Image
                height=400,
            )
            analyze_btn = gr.Button(
                "🔍 Analyze Image",
                variant="primary",
                size="lg",
            )

        # Right column: model output.
        with gr.Column(scale=1):
            output_text = gr.Textbox(
                label="BobVLM's Analysis",
                placeholder="Analysis will appear here...",
                lines=16,
                show_copy_button=True,
            )

    # Clickable example images (loaded by URL).
    # NOTE(review): cache_examples=True runs analyze_image on every example at
    # startup, which is slow on CPU — consider False if launch time matters.
    gr.Examples(
        examples=[
            ["https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRmTRHBR1foifAUzxrQ5GOMyKgRX0iE7f9ivw&s"],
            ["https://i.guim.co.uk/img/media/1e0c3f8bbf09178377309c1f25ea326eaeb5aa0c/0_280_4200_2520/master/4200.jpg?width=1200&quality=85&auto=format&fit=max&s=858bf3e58ee96174b4b3d1499a324bc5"],
        ],
        inputs=input_image,
        outputs=output_text,
        fn=analyze_image,
        cache_examples=True,
    )

    # Wire the button to the inference function.
    analyze_btn.click(
        fn=analyze_image,
        inputs=input_image,
        outputs=output_text,
    )

    gr.Markdown(
        """
        ### About BobVLM
        BobVLM is a Vision Language Model that combines CLIP's visual understanding with LLaMA's language capabilities.
        It was born out of an experiment to train a small adapter layer to see how much it can learn given supervised finetuning (sft) data. The product is a model that can produce detailed and natural
        image descriptions.

        [View on GitHub](https://github.com/logic-OT/BobVLM) | [Hugging Face Model](https://huggingface.co/selfDotOsman/BobVLM-1.5b)
        """
    )
# Start the Gradio server only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()
|