selfDotOsman committed
Commit a458187 · verified · 1 Parent(s): ce9221c

Update app.py

Files changed (1): app.py (+4 −5)
app.py CHANGED
@@ -14,7 +14,7 @@ def analyze_image(image):
     response = pipe(
         chat=[
            {"role": "system", "content": "You are an image understanding assistant. You can see and interpret images in fine detail. Provide clear, engaging descriptions that highlight the key elements and atmosphere of the image."},
-           {"role": "user", "content": "Describe the image"},
+           {"role": "user", "content": "Describe the image briefly"},
        ],
        images=image
    )
@@ -28,9 +28,8 @@ with gr.Blocks(theme=gr.themes.Soft(
 )) as demo:
     gr.Markdown(
         """
-        # 🤖 BobVLM Image Analyzer
-        Upload an image and let BobVLM describe what it sees. BobVLM combines CLIP's vision capabilities
-        with LLaMA's language understanding to provide detailed, natural descriptions of images.
+        # 🤖 BobVLM Demo
+        This demo runs on CPU since I can't afford GPU prices here 🤧, so it is quite slow; bear with me. Upload an image and let BobVLM describe what it sees.
         """
     )
@@ -78,7 +77,7 @@ with gr.Blocks(theme=gr.themes.Soft(
     """
     ### About BobVLM
     BobVLM is a Vision Language Model that combines CLIP's visual understanding with LLaMA's language capabilities.
-    It uses a specialized adapter layer to bridge the gap between vision and language, enabling detailed and natural
+    It was born out of an experiment to train a small adapter layer and see how much it could learn from supervised fine-tuning (SFT) data. The result is a model that produces detailed and natural
     image descriptions.

     [View on GitHub](https://github.com/yourusername/BobVLM) | [Hugging Face Model](https://huggingface.co/selfDotOsman/BobVLM-1.5b)
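The About text describes the design: CLIP's visual features bridged into LLaMA's language space by a small adapter that is the only part trained with SFT. Below is a minimal, hypothetical sketch of what such an adapter could look like, assuming a plain MLP projection; the class name, dimensions, and layer choices are illustrative assumptions, not BobVLM's actual configuration.

```python
import torch
import torch.nn as nn

class VisionLanguageAdapter(nn.Module):
    """Toy adapter in the spirit of the description above: project CLIP
    image features into the language model's embedding space. Dimensions
    are assumptions, not BobVLM's real configuration."""

    def __init__(self, clip_dim: int = 768, llm_dim: int = 2048):
        super().__init__()
        self.proj = nn.Sequential(
            nn.Linear(clip_dim, llm_dim),
            nn.GELU(),
            nn.Linear(llm_dim, llm_dim),
        )

    def forward(self, image_features: torch.Tensor) -> torch.Tensor:
        # (batch, num_patches, clip_dim) -> (batch, num_patches, llm_dim);
        # the projected tokens would be fed to the LLM alongside the text
        # embeddings, so only this small module needs SFT training.
        return self.proj(image_features)

# Example: 196 CLIP patch embeddings mapped into a 2048-dim LLM space.
feats = torch.randn(1, 196, 768)
tokens = VisionLanguageAdapter()(feats)
print(tokens.shape)  # torch.Size([1, 196, 2048])
```

If the design matches this sketch, keeping the trainable part this small is what makes the experiment cheap: the vision encoder and language model stay frozen, and SFT only updates the projection weights.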