selfDotOsman committed
Commit 3a40bac · verified · 1 parent: 399be4f

Create app.py

Files changed (1)
  1. app.py +94 -0
app.py ADDED
@@ -0,0 +1,94 @@
+ import gradio as gr
+ from BobVLM import BobVLMProcessor, load_model, pipeline
+ import torch
+
+ # Load model and processor
+ model = load_model()
+ processor = BobVLMProcessor()
+
+ # Create pipeline
+ pipe = pipeline(model, processor)
+
+ def analyze_image(image):
+     """Process the image and return BobVLM's analysis."""
+     response = pipe(
+         chat=[
+             {"role": "system", "content": "You are an image understanding assistant. You can see and interpret images in fine detail. Provide clear, engaging descriptions that highlight the key elements and atmosphere of the image."},
+             {"role": "user", "content": "Describe the image"},
+         ],
+         images=image,
+     )
+     return response[0] if response else "I couldn't analyze this image."
+
+ # Create the Gradio interface
+ with gr.Blocks(theme=gr.themes.Soft(
+     primary_hue="blue",
+     secondary_hue="indigo",
+     neutral_hue="slate",
+ )) as demo:
+     gr.Markdown(
+         """
+         # 🤖 BobVLM Image Analyzer
+         Upload an image and let BobVLM describe what it sees. BobVLM combines CLIP's vision capabilities
+         with LLaMA's language understanding to provide detailed, natural descriptions of images.
+         """
+     )
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             input_image = gr.Image(
+                 label="Upload Image",
+                 type="pil",
+                 height=400,
+             )
+             analyze_btn = gr.Button(
+                 "🔍 Analyze Image",
+                 variant="primary",
+                 size="lg",
+             )
+
+         with gr.Column(scale=1):
+             output_text = gr.Textbox(
+                 label="BobVLM's Analysis",
+                 placeholder="Analysis will appear here...",
+                 lines=16,
+                 show_copy_button=True,
+             )
+
+     # Add examples (placeholder paths; point these at real images before enabling caching)
+     gr.Examples(
+         examples=[
+             ["path/to/example1.jpg"],
+             ["path/to/example2.jpg"],
+         ],
+         inputs=input_image,
+         outputs=output_text,
+         fn=analyze_image,
+         cache_examples=False,
+     )
+
+     # Set up the click event
+     analyze_btn.click(
+         fn=analyze_image,
+         inputs=input_image,
+         outputs=output_text,
+     )
+
+     gr.Markdown(
+         """
+         ### About BobVLM
+         BobVLM is a Vision Language Model that combines CLIP's visual understanding with LLaMA's language capabilities.
+         It uses a specialized adapter layer to bridge the gap between vision and language, enabling detailed and natural
+         image descriptions.
+
+         [View on GitHub](https://github.com/yourusername/BobVLM) | [Hugging Face Model](https://huggingface.co/selfDotOsman/BobVLM-1.5b)
+         """
+     )
+
+ # Launch the app
+ if __name__ == "__main__":
+     demo.queue()  # request queuing; replaces the deprecated enable_queue launch flag
+     demo.launch(
+         share=True,
+         show_error=True,
+     )
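
For quick testing outside the Gradio UI, the same pipeline can be driven directly. The following is a minimal sketch mirroring the app.py code above; it assumes the BobVLM package is installed, and "example.jpg" is a hypothetical placeholder path:

    from PIL import Image
    from BobVLM import BobVLMProcessor, load_model, pipeline

    # Same setup as app.py: load model, processor, and build the pipeline
    pipe = pipeline(load_model(), BobVLMProcessor())

    # "example.jpg" is a hypothetical placeholder; substitute any local image
    image = Image.open("example.jpg")
    response = pipe(
        chat=[
            {"role": "system", "content": "You are an image understanding assistant."},
            {"role": "user", "content": "Describe the image"},
        ],
        images=image,
    )
    print(response[0] if response else "No analysis returned.")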