Spaces:
Running
on
Zero
Running
on
Zero
onuralpszr
commited on
fix: π single input for detection and handle class names
Browse filesSigned-off-by: Onuralp SEZER <[email protected]>
app.py
CHANGED
@@ -54,6 +54,12 @@ model_id = "google/paligemma2-3b-pt-448"
|
|
54 |
model = PaliGemmaForConditionalGeneration.from_pretrained(model_id).eval().to(DEVICE)
|
55 |
processor = PaliGemmaProcessor.from_pretrained(model_id)
|
56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
@spaces.GPU
|
58 |
def paligemma_detection(input_image, input_text, max_new_tokens):
|
59 |
model_inputs = processor(text=input_text,
|
@@ -70,13 +76,17 @@ def paligemma_detection(input_image, input_text, max_new_tokens):
|
|
70 |
|
71 |
|
72 |
|
73 |
-
def annotate_image(result, resolution_wh,
|
|
|
|
|
|
|
|
|
74 |
|
75 |
detections = sv.Detections.from_lmm(
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
)
|
81 |
|
82 |
annotated_image = BOX_ANNOTATOR.annotate(
|
@@ -98,17 +108,17 @@ def annotate_image(result, resolution_wh, class_names, cv_image):
|
|
98 |
return annotated_image
|
99 |
|
100 |
|
101 |
-
def process_image(input_image, input_text,
|
102 |
cv_image = cv2.cvtColor(np.array(input_image), cv2.COLOR_RGB2BGR)
|
103 |
result = paligemma_detection(input_image, input_text, max_new_tokens)
|
104 |
annotated_image = annotate_image(result,
|
105 |
(input_image.width, input_image.height),
|
106 |
-
|
107 |
return annotated_image, result
|
108 |
|
109 |
|
110 |
@spaces.GPU
|
111 |
-
def process_video(input_video, input_text,
|
112 |
if not input_video:
|
113 |
gr.Info("Please upload a video.")
|
114 |
return None
|
@@ -117,6 +127,11 @@ def process_video(input_video, input_text, class_names, max_new_tokens, progress
|
|
117 |
gr.Info("Please enter a text prompt.")
|
118 |
return None
|
119 |
|
|
|
|
|
|
|
|
|
|
|
120 |
name = generate_unique_name()
|
121 |
frame_directory_path = os.path.join(VIDEO_TARGET_DIRECTORY, name)
|
122 |
create_directory(frame_directory_path)
|
@@ -146,7 +161,7 @@ def process_video(input_video, input_text, class_names, max_new_tokens, progress
|
|
146 |
sv.LMM.PALIGEMMA,
|
147 |
result,
|
148 |
resolution_wh=(video_info.width, video_info.height),
|
149 |
-
classes=class_names
|
150 |
)
|
151 |
|
152 |
annotated_frame = BOX_ANNOTATOR.annotate(
|
@@ -177,15 +192,18 @@ with gr.Blocks() as app:
|
|
177 |
with gr.Row():
|
178 |
with gr.Column():
|
179 |
input_image = gr.Image(type="pil", label="Input Image")
|
180 |
-
input_text = gr.Textbox(
|
181 |
-
|
|
|
|
|
|
|
182 |
max_new_tokens = gr.Slider(minimum=20, maximum=200, value=100, step=10, label="Max New Tokens", info="Set to larger for longer generation.")
|
183 |
with gr.Column():
|
184 |
annotated_image = gr.Image(type="pil", label="Annotated Image")
|
185 |
detection_result = gr.Textbox(label="Detection Result")
|
186 |
gr.Button("Submit").click(
|
187 |
fn=process_image,
|
188 |
-
inputs=[input_image, input_text,
|
189 |
outputs=[annotated_image, detection_result]
|
190 |
)
|
191 |
|
@@ -193,8 +211,11 @@ with gr.Blocks() as app:
|
|
193 |
with gr.Row():
|
194 |
with gr.Column():
|
195 |
input_video = gr.Video(label="Input Video")
|
196 |
-
input_text = gr.Textbox(
|
197 |
-
|
|
|
|
|
|
|
198 |
max_new_tokens = gr.Slider(minimum=20, maximum=200, value=100, step=1, label="Max New Tokens", info="Set to larger for longer generation.")
|
199 |
with gr.Column():
|
200 |
output_video = gr.Video(label="Annotated Video")
|
@@ -202,7 +223,7 @@ with gr.Blocks() as app:
|
|
202 |
|
203 |
gr.Button("Process Video").click(
|
204 |
fn=process_video,
|
205 |
-
inputs=[input_video, input_text,
|
206 |
outputs=[output_video, detection_result]
|
207 |
)
|
208 |
|
|
|
54 |
model = PaliGemmaForConditionalGeneration.from_pretrained(model_id).eval().to(DEVICE)
|
55 |
processor = PaliGemmaProcessor.from_pretrained(model_id)
|
56 |
|
57 |
+
def parse_class_names(prompt):
|
58 |
+
if not prompt.lower().startswith('detect '):
|
59 |
+
return []
|
60 |
+
classes_text = prompt[7:].strip()
|
61 |
+
return [cls.strip() for cls in classes_text.split(';') if cls.strip()]
|
62 |
+
|
63 |
@spaces.GPU
|
64 |
def paligemma_detection(input_image, input_text, max_new_tokens):
|
65 |
model_inputs = processor(text=input_text,
|
|
|
76 |
|
77 |
|
78 |
|
79 |
+
def annotate_image(result, resolution_wh, prompt, cv_image):
|
80 |
+
class_names = parse_class_names(prompt)
|
81 |
+
if not class_names:
|
82 |
+
gr.Warning("Invalid prompt format. Please use 'detect class1;class2;class3' format")
|
83 |
+
return cv_image
|
84 |
|
85 |
detections = sv.Detections.from_lmm(
|
86 |
+
sv.LMM.PALIGEMMA,
|
87 |
+
result,
|
88 |
+
resolution_wh=resolution_wh,
|
89 |
+
classes=class_names
|
90 |
)
|
91 |
|
92 |
annotated_image = BOX_ANNOTATOR.annotate(
|
|
|
108 |
return annotated_image
|
109 |
|
110 |
|
111 |
+
def process_image(input_image, input_text, max_new_tokens):
|
112 |
cv_image = cv2.cvtColor(np.array(input_image), cv2.COLOR_RGB2BGR)
|
113 |
result = paligemma_detection(input_image, input_text, max_new_tokens)
|
114 |
annotated_image = annotate_image(result,
|
115 |
(input_image.width, input_image.height),
|
116 |
+
input_text, cv_image)
|
117 |
return annotated_image, result
|
118 |
|
119 |
|
120 |
@spaces.GPU
|
121 |
+
def process_video(input_video, input_text, max_new_tokens, progress=gr.Progress(track_tqdm=True)):
|
122 |
if not input_video:
|
123 |
gr.Info("Please upload a video.")
|
124 |
return None
|
|
|
127 |
gr.Info("Please enter a text prompt.")
|
128 |
return None
|
129 |
|
130 |
+
class_names = parse_class_names(input_text)
|
131 |
+
if not class_names:
|
132 |
+
gr.Warning("Invalid prompt format. Please use 'detect class1;class2;class3' format")
|
133 |
+
return None, None
|
134 |
+
|
135 |
name = generate_unique_name()
|
136 |
frame_directory_path = os.path.join(VIDEO_TARGET_DIRECTORY, name)
|
137 |
create_directory(frame_directory_path)
|
|
|
161 |
sv.LMM.PALIGEMMA,
|
162 |
result,
|
163 |
resolution_wh=(video_info.width, video_info.height),
|
164 |
+
classes=class_names
|
165 |
)
|
166 |
|
167 |
annotated_frame = BOX_ANNOTATOR.annotate(
|
|
|
192 |
with gr.Row():
|
193 |
with gr.Column():
|
194 |
input_image = gr.Image(type="pil", label="Input Image")
|
195 |
+
input_text = gr.Textbox(
|
196 |
+
lines=2,
|
197 |
+
placeholder="Enter prompt in format like this: detect person;dog;building",
|
198 |
+
label="Enter detection prompt"
|
199 |
+
)
|
200 |
max_new_tokens = gr.Slider(minimum=20, maximum=200, value=100, step=10, label="Max New Tokens", info="Set to larger for longer generation.")
|
201 |
with gr.Column():
|
202 |
annotated_image = gr.Image(type="pil", label="Annotated Image")
|
203 |
detection_result = gr.Textbox(label="Detection Result")
|
204 |
gr.Button("Submit").click(
|
205 |
fn=process_image,
|
206 |
+
inputs=[input_image, input_text, max_new_tokens],
|
207 |
outputs=[annotated_image, detection_result]
|
208 |
)
|
209 |
|
|
|
211 |
with gr.Row():
|
212 |
with gr.Column():
|
213 |
input_video = gr.Video(label="Input Video")
|
214 |
+
input_text = gr.Textbox(
|
215 |
+
lines=2,
|
216 |
+
placeholder="Enter prompt in format like this: detect person;dog;building",
|
217 |
+
label="Enter detection prompt"
|
218 |
+
)
|
219 |
max_new_tokens = gr.Slider(minimum=20, maximum=200, value=100, step=1, label="Max New Tokens", info="Set to larger for longer generation.")
|
220 |
with gr.Column():
|
221 |
output_video = gr.Video(label="Annotated Video")
|
|
|
223 |
|
224 |
gr.Button("Process Video").click(
|
225 |
fn=process_video,
|
226 |
+
inputs=[input_video, input_text, max_new_tokens],
|
227 |
outputs=[output_video, detection_result]
|
228 |
)
|
229 |
|