hermanhelf committed
Commit
9002c41
1 Parent(s): d914d44

simple edit to make simple

Files changed (1)
  1. app.py +58 -55
app.py CHANGED
@@ -14,7 +14,8 @@ import numpy as np
 import spaces
 
 
-model_id = "google/paligemma-3b-mix-448"
+# model_id = "google/paligemma-3b-mix-448"
+model_id = "hermanhelf/paligemma"
 COLORS = ['#4285f4', '#db4437', '#f4b400', '#0f9d58', '#e48ef1']
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model = PaliGemmaForConditionalGeneration.from_pretrained(model_id).eval().to(device)
@@ -64,23 +65,24 @@ def parse_segmentation(input_image, input_text):
 
 ######## Demo
 
-INTRO_TEXT = """## PaliGemma demo\n\n
-| [Github](https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md)
-| [Blogpost](https://huggingface.co/blog/paligemma)
-|\n\n
-PaliGemma is an open vision-language model by Google, inspired by [PaLI-3](https://arxiv.org/abs/2310.09199) and
-built with open components such as the [SigLIP](https://arxiv.org/abs/2303.15343)
-vision model and the [Gemma](https://arxiv.org/abs/2403.08295) language model. PaliGemma is designed as a versatile
-model for transfer to a wide range of vision-language tasks such as image and short video caption, visual question
-answering, text reading, object detection and object segmentation.
-\n\n
-This space includes models fine-tuned on a mix of downstream tasks, **inferred via 🤗 transformers**.
-See the [Blogpost](https://huggingface.co/blog/paligemma) and
-[README](https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md)
-for detailed information how to use and fine-tune PaliGemma models.
-\n\n
-**This is an experimental research model.** Make sure to add appropriate guardrails when using the model for applications.
-"""
+# INTRO_TEXT = """## PaliGemma demo\n\n
+# | [Github](https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md)
+# | [Blogpost](https://huggingface.co/blog/paligemma)
+# |\n\n
+# PaliGemma is an open vision-language model by Google, inspired by [PaLI-3](https://arxiv.org/abs/2310.09199) and
+# built with open components such as the [SigLIP](https://arxiv.org/abs/2303.15343)
+# vision model and the [Gemma](https://arxiv.org/abs/2403.08295) language model. PaliGemma is designed as a versatile
+# model for transfer to a wide range of vision-language tasks such as image and short video caption, visual question
+# answering, text reading, object detection and object segmentation.
+# \n\n
+# This space includes models fine-tuned on a mix of downstream tasks, **inferred via 🤗 transformers**.
+# See the [Blogpost](https://huggingface.co/blog/paligemma) and
+# [README](https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md)
+# for detailed information how to use and fine-tune PaliGemma models.
+# \n\n
+# **This is an experimental research model.** Make sure to add appropriate guardrails when using the model for applications.
+# """
+INTRO_TEXT = "## Demo\n\n"
 
 
 with gr.Blocks(css="style.css") as demo:
@@ -92,14 +94,15 @@ with gr.Blocks(css="style.css") as demo:
 
         text_output = gr.Text(label="Text Output")
         chat_btn = gr.Button()
-        tokens = gr.Slider(
-            label="Max New Tokens",
-            info="Set to larger for longer generation.",
-            minimum=10,
-            maximum=100,
-            value=20,
-            step=10,
-        )
+        # tokens = gr.Slider(
+        #     label="Max New Tokens",
+        #     info="Set to larger for longer generation.",
+        #     minimum=10,
+        #     maximum=100,
+        #     value=20,
+        #     step=10,
+        # )
+        tokens = 20
 
         chat_inputs = [
             image,
@@ -127,34 +130,34 @@ with gr.Blocks(css="style.css") as demo:
             examples=examples,
            inputs=chat_inputs,
         )
-    with gr.Tab("Segment/Detect"):
-        image = gr.Image(type="pil")
-        seg_input = gr.Text(label="Entities to Segment/Detect")
-        seg_btn = gr.Button("Submit")
-        annotated_image = gr.AnnotatedImage(label="Output")
-
-        examples = [["./cats.png", "segment cats"],
-                    ["./bee.jpg", "detect bee"],
-                    ["./examples/barsik.jpg", "segment cat"],
-                    ["./bird.jpg", "segment bird ; bird ; plant"]]
-        gr.Markdown("Example images are licensed CC0 by [akolesnikoff@](https://github.com/akolesnikoff), [mbosnjak@](https://github.com/mbosnjak), [maximneumann@](https://github.com/maximneumann) and [merve](https://huggingface.co/merve).")
-        gr.Examples(
-            examples=examples,
-            inputs=[image, seg_input],
-        )
-
-        seg_inputs = [
-            image,
-            seg_input
-        ]
-        seg_outputs = [
-            annotated_image
-        ]
-        seg_btn.click(
-            fn=parse_segmentation,
-            inputs=seg_inputs,
-            outputs=seg_outputs,
-        )
+    # with gr.Tab("Segment/Detect"):
+    #     image = gr.Image(type="pil")
+    #     seg_input = gr.Text(label="Entities to Segment/Detect")
+    #     seg_btn = gr.Button("Submit")
+    #     annotated_image = gr.AnnotatedImage(label="Output")
+
+    #     examples = [["./cats.png", "segment cats"],
+    #                 ["./bee.jpg", "detect bee"],
+    #                 ["./examples/barsik.jpg", "segment cat"],
+    #                 ["./bird.jpg", "segment bird ; bird ; plant"]]
+    #     gr.Markdown("Example images are licensed CC0 by [akolesnikoff@](https://github.com/akolesnikoff), [mbosnjak@](https://github.com/mbosnjak), [maximneumann@](https://github.com/maximneumann) and [merve](https://huggingface.co/merve).")
+    #     gr.Examples(
+    #         examples=examples,
+    #         inputs=[image, seg_input],
+    #     )
+
+    #     seg_inputs = [
+    #         image,
+    #         seg_input
+    #     ]
+    #     seg_outputs = [
+    #         annotated_image
+    #     ]
+    #     seg_btn.click(
+    #         fn=parse_segmentation,
+    #         inputs=seg_inputs,
+    #         outputs=seg_outputs,
+    #     )
 
 
 
@@ -323,4 +326,4 @@ def extract_objs(text, width, height, unique_labels=False):
 #########
 
 if __name__ == "__main__":
-    demo.queue(max_size=10).launch(debug=True)
+    demo.queue(max_size=10).launch(debug=True)
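
For context (not part of the commit): a minimal standalone sketch of how the pieces this diff touches fit together, assuming the standard 🤗 transformers PaliGemma API. The checkpoint id `hermanhelf/paligemma` and the `max_new_tokens=20` value are taken from the diff; the processor choice, image path, and prompt are illustrative assumptions.

```python
# Minimal sketch, not the Space's own code: load the checkpoint this commit
# switches to and run one generation, mirroring the hard-coded tokens = 20.
import torch
from PIL import Image
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

model_id = "hermanhelf/paligemma"  # checkpoint id taken from the diff
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# If the fine-tuned repo ships no processor files, load the processor from the
# base checkpoint (e.g. the previous model_id) instead.
processor = AutoProcessor.from_pretrained(model_id)
model = PaliGemmaForConditionalGeneration.from_pretrained(model_id).eval().to(device)

image = Image.open("cats.png")  # placeholder image path
prompt = "caption en"           # PaliGemma-style task prompt (illustrative)

inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
with torch.inference_mode():
    output_ids = model.generate(**inputs, max_new_tokens=20)

# Decode only the newly generated tokens, not the echoed prompt.
new_tokens = output_ids[0, inputs["input_ids"].shape[-1]:]
print(processor.decode(new_tokens, skip_special_tokens=True))
```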