Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -14,7 +14,7 @@ def analyze_image(image):
|
|
14 |
response = pipe(
|
15 |
chat=[
|
16 |
{"role": "system", "content": "You are an image understanding assistant. You can see and interpret images in fine detail. Provide clear, engaging descriptions that highlight the key elements and atmosphere of the image."},
|
17 |
-
{"role": "user", "content": "Describe the image"},
|
18 |
],
|
19 |
images=image
|
20 |
)
|
@@ -28,9 +28,8 @@ with gr.Blocks(theme=gr.themes.Soft(
|
|
28 |
)) as demo:
|
29 |
gr.Markdown(
|
30 |
"""
|
31 |
-
# 🤖 BobVLM
|
32 |
-
Upload an image and let BobVLM describe what it sees
|
33 |
-
with LLaMA's language understanding to provide detailed, natural descriptions of images.
|
34 |
"""
|
35 |
)
|
36 |
|
@@ -78,7 +77,7 @@ with gr.Blocks(theme=gr.themes.Soft(
|
|
78 |
"""
|
79 |
### About BobVLM
|
80 |
BobVLM is a Vision Language Model that combines CLIP's visual understanding with LLaMA's language capabilities.
|
81 |
-
It
|
82 |
image descriptions.
|
83 |
|
84 |
[View on GitHub](https://github.com/yourusername/BobVLM) | [Hugging Face Model](https://huggingface.co/selfDotOsman/BobVLM-1.5b)
|
|
|
14 |
response = pipe(
|
15 |
chat=[
|
16 |
{"role": "system", "content": "You are an image understanding assistant. You can see and interpret images in fine detail. Provide clear, engaging descriptions that highlight the key elements and atmosphere of the image."},
|
17 |
+
{"role": "user", "content": "Describe the image shortly"},
|
18 |
],
|
19 |
images=image
|
20 |
)
|
|
|
28 |
)) as demo:
|
29 |
gr.Markdown(
|
30 |
"""
|
31 |
+
# 🤖 BobVLM Demo
|
32 |
+
This demo runs on CPU since I can't afford GPU prices here 🤧, so it is quite slow; please bear with me. Upload an image and let BobVLM describe what it sees.
|
|
|
33 |
"""
|
34 |
)
|
35 |
|
|
|
77 |
"""
|
78 |
### About BobVLM
|
79 |
BobVLM is a Vision Language Model that combines CLIP's visual understanding with LLaMA's language capabilities.
|
80 |
+
It was born out of an experiment to train a small adapter layer to see how much it can learn given supervised fine-tuning (SFT) data. The product is a model that can produce detailed and natural
|
81 |
image descriptions.
|
82 |
|
83 |
[View on GitHub](https://github.com/yourusername/BobVLM) | [Hugging Face Model](https://huggingface.co/selfDotOsman/BobVLM-1.5b)
|