File size: 3,040 Bytes
3a40bac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a458187
3a40bac
 
 
 
 
 
 
 
 
 
 
 
 
a458187
 
3a40bac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ce9221c
 
3a40bac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a458187
3a40bac
 
d9ff7a1
3a40bac
 
 
 
 
d52e4b1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import gradio as gr
from BobVLM import BobVLMProcessor, load_model, pipeline
import torch

# Load model and processor
# NOTE(review): load_model() presumably downloads/initializes the BobVLM
# checkpoint (CPU-only per the UI text below) — one-time, potentially slow.
model = load_model()
processor = BobVLMProcessor()

# Create pipeline
# Module-level singleton reused by every analyze_image() call.
pipe = pipeline(model, processor)

def analyze_image(image):
    """Run BobVLM on *image* and return its textual description.

    Args:
        image: PIL image from the Gradio input component. Gradio passes
            None when the user clicks Analyze before uploading anything.

    Returns:
        The model's first generated string, or a fallback message when
        no image was supplied or the pipeline produced no output.
    """
    # Guard: without this, pipe(...) would crash on a missing image.
    if image is None:
        return "Please upload an image first."
    response = pipe(
        chat=[
            {"role": "system", "content": "You are an image understanding assistant. You can see and interpret images in fine detail. Provide clear, engaging descriptions that highlight the key elements and atmosphere of the image."},
            {"role": "user", "content": "Describe the image shortly"},
        ],
        images=image
    )
    # The pipeline returns a list of generated strings; surface the first.
    return response[0] if response else "I couldn't analyze this image."

# Create the Gradio interface.
# Layout: two columns (image input + button | analysis textbox), with
# cached examples and footer markdown describing the model.
with gr.Blocks(theme=gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="indigo",
    neutral_hue="slate",
)) as demo:
    gr.Markdown(
        """
        # πŸ€– BobVLM Demo
        This demo runs on cpu since I can't afford GPU prices here 🀧. So it is quite slow so bear with me. Upload an image and let BobVLM describe what it sees
        """
    )
    
    with gr.Row():
        with gr.Column(scale=1):
            input_image = gr.Image(
                label="Upload Image",
                type="pil",  # deliver a PIL image, matching analyze_image's expectation
                height=400,
            )
            analyze_btn = gr.Button(
                "πŸ” Analyze Image",
                variant="primary",
                size="lg",
            )
        
        with gr.Column(scale=1):
            output_text = gr.Textbox(
                label="BobVLM's Analysis",
                placeholder="Analysis will appear here...",
                lines=16,
                show_copy_button=True,
            )
    
    # Add examples. cache_examples=True runs analyze_image on each example
    # at startup so clicking an example returns instantly (important on CPU).
    gr.Examples(
        examples=[
            ["https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRmTRHBR1foifAUzxrQ5GOMyKgRX0iE7f9ivw&s"],
            ["https://i.guim.co.uk/img/media/1e0c3f8bbf09178377309c1f25ea326eaeb5aa0c/0_280_4200_2520/master/4200.jpg?width=1200&quality=85&auto=format&fit=max&s=858bf3e58ee96174b4b3d1499a324bc5"],
        ],
        inputs=input_image,
        outputs=output_text,
        fn=analyze_image,
        cache_examples=True,
    )
    
    # Wire the button click to the analysis function.
    analyze_btn.click(
        fn=analyze_image,
        inputs=input_image,
        outputs=output_text,
    )
    
    gr.Markdown(
        """
        ### About BobVLM
        BobVLM is a Vision Language Model that combines CLIP's visual understanding with LLaMA's language capabilities.
        It was born out of an experiment to train a small adapter layer to see how much it can learn given supervised finetuning (sft) data. The product is a model that can produce detailed and natural
        image descriptions.
        
        [View on GitHub](https://github.com/logic-OT/BobVLM) | [Hugging Face Model](https://huggingface.co/selfDotOsman/BobVLM-1.5b)
        """
    )

# Launch the app only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()