Spaces:

sahirp
/

text_2_segment_any_vid

Running on Zero

App Files Files Community

er1t0 committed on Aug 3, 2024

Commit

8870220

1 Parent(s): 9b87d5a

torch autocast

Browse files

Files changed (2) hide show

.gitignore +3 -0
app.py +55 -46

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+temp_frames
+temp_frames_30
+segmented_video.mp4

app.py CHANGED Viewed

@@ -41,6 +41,8 @@ florence_model = load_model_without_flash_attn(load_florence_model)
 florence_processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
 def apply_color_mask(frame, mask, obj_id):
     cmap = plt.get_cmap("tab10")
     color = np.array(cmap(obj_id % 10)[:3])  # Use modulo 10 to cycle through colors
@@ -61,25 +63,26 @@ def apply_color_mask(frame, mask, obj_id):
     colored_mask = mask * color
     return frame * (1 - mask) + colored_mask * 255
 def run_florence(image, text_input):
-    with torch.amp.autocast(dtype=torch.bfloat16):
-        task_prompt = '<OPEN_VOCABULARY_DETECTION>'
-        prompt = task_prompt + text_input
-        inputs = florence_processor(text=prompt, images=image, return_tensors="pt").to('cuda', torch.bfloat16)
-        generated_ids = florence_model.generate(
-            input_ids=inputs["input_ids"].cuda(),
-            pixel_values=inputs["pixel_values"].cuda(),
-            max_new_tokens=1024,
-            early_stopping=False,
-            do_sample=False,
-            num_beams=3,
-        )
-        generated_text = florence_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
-        parsed_answer = florence_processor.post_process_generation(
-            generated_text,
-            task=task_prompt,
-            image_size=(image.width, image.height)
-        )
     return parsed_answer[task_prompt]['bboxes'][0]
 def remove_directory_contents(directory):
@@ -89,7 +92,8 @@ def remove_directory_contents(directory):
         for name in dirs:
             os.rmdir(os.path.join(root, name))
 def process_video(video_path, prompt):
     try:
         # Get video info
@@ -123,14 +127,13 @@ def process_video(video_path, prompt):
         print("Reshaped mask box:", mask_box)
         # SAM2 segmentation on first frame
-        with torch.cuda.amp.autocast(dtype=torch.bfloat16):
-            image_predictor.set_image(first_frame)
-            masks, _, _ = image_predictor.predict(
-                point_coords=None,
-                point_labels=None,
-                box=mask_box[None, :],
-                multimask_output=False,
-            )
         print("masks.shape", masks.shape)
         mask = masks.squeeze().astype(bool)
@@ -145,21 +148,20 @@ def process_video(video_path, prompt):
         print(f"Saved {len(frames)} temporary frames")
-        with torch.cuda.amp.autocast(dtype=torch.bfloat16):
-            inference_state = video_predictor.init_state(video_path=temp_dir)
-            _, _, _ = video_predictor.add_new_mask(
-                inference_state=inference_state,
-                frame_idx=0,
-                obj_id=1,
-                mask=mask
-            )
-            video_segments = {}
-            for out_frame_idx, out_obj_ids, out_mask_logits in video_predictor.propagate_in_video(inference_state):
-                video_segments[out_frame_idx] = {
-                    out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
-                    for i, out_obj_id in enumerate(out_obj_ids)
-                }
         print('Segmenting for main vid done')
         print(f"Number of segmented frames: {len(video_segments)}")
@@ -216,12 +218,19 @@ def segment_video(video_file, prompt):
 demo = gr.Interface(
     fn=segment_video,
     inputs=[
-        gr.Video(label="Upload Video"),
-        gr.Textbox(label="Enter prompt (e.g., 'a gymnast')")
     ],
     outputs=gr.Video(label="Segmented Video"),
-    title="Video Object Segmentation with Florence and SAM2",
-    description="Upload a video and provide a text prompt to segment a specific object throughout the video."
 )
 demo.launch()

 florence_processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
 def apply_color_mask(frame, mask, obj_id):
     cmap = plt.get_cmap("tab10")
     color = np.array(cmap(obj_id % 10)[:3])  # Use modulo 10 to cycle through colors
     colored_mask = mask * color
     return frame * (1 - mask) + colored_mask * 255
+@torch.inference_mode()
+@torch.autocast(device_type="cuda", dtype=torch.bfloat16)
 def run_florence(image, text_input):
+    task_prompt = '<OPEN_VOCABULARY_DETECTION>'
+    prompt = task_prompt + text_input
+    inputs = florence_processor(text=prompt, images=image, return_tensors="pt").to('cuda', torch.bfloat16)
+    generated_ids = florence_model.generate(
+        input_ids=inputs["input_ids"].cuda(),
+        pixel_values=inputs["pixel_values"].cuda(),
+        max_new_tokens=1024,
+        early_stopping=False,
+        do_sample=False,
+        num_beams=3,
+    )
+    generated_text = florence_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+    parsed_answer = florence_processor.post_process_generation(
+        generated_text,
+        task=task_prompt,
+        image_size=(image.width, image.height)
+    )
     return parsed_answer[task_prompt]['bboxes'][0]
 def remove_directory_contents(directory):
         for name in dirs:
             os.rmdir(os.path.join(root, name))
+@torch.inference_mode()
+@torch.autocast(device_type="cuda", dtype=torch.bfloat16)
 def process_video(video_path, prompt):
     try:
         # Get video info
         print("Reshaped mask box:", mask_box)
         # SAM2 segmentation on first frame
+        image_predictor.set_image(first_frame)
+        masks, _, _ = image_predictor.predict(
+            point_coords=None,
+            point_labels=None,
+            box=mask_box[None, :],
+            multimask_output=False,
+        )
         print("masks.shape", masks.shape)
         mask = masks.squeeze().astype(bool)
         print(f"Saved {len(frames)} temporary frames")
+        inference_state = video_predictor.init_state(video_path=temp_dir)
+        _, _, _ = video_predictor.add_new_mask(
+            inference_state=inference_state,
+            frame_idx=0,
+            obj_id=1,
+            mask=mask
+        )
+        video_segments = {}
+        for out_frame_idx, out_obj_ids, out_mask_logits in video_predictor.propagate_in_video(inference_state):
+            video_segments[out_frame_idx] = {
+                out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
+                for i, out_obj_id in enumerate(out_obj_ids)
+            }
         print('Segmenting for main vid done')
         print(f"Number of segmented frames: {len(video_segments)}")
 demo = gr.Interface(
     fn=segment_video,
     inputs=[
+        gr.Video(label="Upload Video (Keep it under 10 seconds for this demo)"),
+        gr.Textbox(label="Enter text prompt for object detection")
     ],
     outputs=gr.Video(label="Segmented Video"),
+    title="Text-Prompted Video Object Segmentation",
+    description="""
+    This demo uses [Florence-2](https://huggingface.co/microsoft/Florence-2-large), a vision-language model, to enable text-prompted object detection for [SAM2](https://github.com/facebookresearch/segment-anything).
+    Florence-2 interprets your text prompt, allowing SAM2 to segment the described object in the video.
+    1. Upload a short video (< 10 sec)
+    2. Describe the object to segment
+    3. Get your segmented video!
+    """
 )
 demo.launch()