import cv2
import os
import tempfile
import torch
import numpy as np
from PIL import Image
import supervision as sv
from tqdm import tqdm
from utils.video import generate_unique_name, create_directory, delete_directory
from utils.florence import load_florence_model, run_florence_inference, FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
from utils.sam import load_sam_image_model, load_sam_video_model, run_sam_inference
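# VideoProcessor ties Florence-2 open-vocabulary detection to SAM 2 segmentation:
# the text prompt is grounded on the first frame, the resulting masks are
# propagated through the rest of the clip, and every frame is written out as a
# white-on-black binary mask image.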
class VideoProcessor:
    def __init__(self, device=None):
        self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.autocast = None
        self.florence_model = None
        self.florence_processor = None
        self.sam_image_model = None
        self.sam_video_model = None
        # Set up mask annotator with a white color palette
        self.mask_annotator = sv.MaskAnnotator(
            color=sv.ColorPalette.from_hex(["#FFFFFF"]),
            color_lookup=sv.ColorLookup.INDEX
        )
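    # Model loading is kept out of __init__ so a VideoProcessor can be
    # constructed cheaply and the heavy checkpoints pulled in only when needed.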
    def _load_models(self):
        # Load models
        self.florence_model, self.florence_processor = load_florence_model(device=self.device)
        self.sam_image_model = load_sam_image_model(device=self.device)
        self.sam_video_model = load_sam_video_model(device=self.device)
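    # Mixed precision: computations inside the autocast context run in bfloat16,
    # and TF32 matmuls are additionally enabled on Ampere-or-newer GPUs
    # (compute capability >= 8), which speeds up inference on supported hardware.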
    def _enable_mixed_precision(self):
        self.autocast = torch.autocast(device_type=self.device.type, dtype=torch.bfloat16)
        self.autocast.__enter__()
        if torch.cuda.is_available() and torch.cuda.get_device_properties(0).major >= 8:
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.backends.cudnn.allow_tf32 = True

    def _reset_mixed_precision(self):
        # Exit the autocast context, if one was entered
        if self.autocast is not None:
            self.autocast.__exit__(None, None, None)
            self.autocast = None
        # Reset CUDA settings
        if torch.cuda.is_available():
            torch.backends.cuda.matmul.allow_tf32 = False
            torch.backends.cudnn.allow_tf32 = False
            torch.cuda.empty_cache()  # Clear the CUDA cache
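    # End-to-end entry point: segment the video according to the prompt, then
    # split the resulting mask video into frames. Returns the session directory,
    # the source frame rate, and the input/output frame directories.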
    def process_video(self, video_path, scale_factor, prompt):
        self.scale_factor = scale_factor
        # Process video based on the prompt
        output_video_path, session_path, input_frames_dir, output_directory_path = self._process_prompt(video_path, prompt)
        # Create frames from the output video
        fps = self._create_frames(output_video_path, output_directory_path)
        # Delete the output video
        os.remove(output_video_path)
        return session_path, fps, input_frames_dir, output_directory_path
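    # Extract every frame of the rendered mask video as a numbered JPEG and
    # report the video's frame rate so callers can reassemble it later.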
    def _create_frames(self, video_path, output_dir):
        create_directory(output_dir)
        # Open the video once and read its frame rate
        cap = cv2.VideoCapture(video_path)
        fps = cap.get(cv2.CAP_PROP_FPS)
        # Save all the frames to the output directory
        count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            cv2.imwrite(f"{output_dir}/frame_{count:04d}.jpg", frame)
            count += 1
        cap.release()
        return fps
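    # Core pipeline: detect each comma-separated prompt term on the first frame
    # with Florence-2, refine the boxes into masks with the SAM 2 image model,
    # then propagate those masks through the full clip with the SAM 2 video model
    # and render the result as a white-mask-on-black video.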
    def _process_prompt(self, video_path, prompt):
        # Process the first frame with the prompt using the loaded models
        frame_generator = sv.get_video_frames_generator(video_path)
        frame = next(frame_generator)
        frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        texts = [p.strip() for p in prompt.split(",")]
        detections_list = []
        for text in texts:
            _, result = run_florence_inference(
                model=self.florence_model,
                processor=self.florence_processor,
                device=self.device,
                image=frame,
                task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
                text=text
            )
            detections = sv.Detections.from_lmm(
                lmm=sv.LMM.FLORENCE_2,
                result=result,
                resolution_wh=frame.size
            )
            detections = run_sam_inference(self.sam_image_model, frame, detections)
            detections_list.append(detections)
        # Merge detections from all prompts
        detections = sv.Detections.merge(detections_list)
        detections = run_sam_inference(self.sam_image_model, frame, detections)
        # Check if any objects were detected
        if detections.mask is None or len(detections.mask) == 0:
            raise ValueError(f"No objects of class {', '.join(texts)} found in the first frame of the video.")
        # Generate a unique name for this processing session
        name = generate_unique_name()
        # Create temporary working directories for the session and its input/output frames
        session_path = tempfile.mkdtemp(prefix="video_processing_")
        frame_directory_path = tempfile.mkdtemp(prefix="input_frames_", dir=session_path)
        output_directory_path = tempfile.mkdtemp(prefix="output_frames_", dir=session_path)
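        # SAM 2's video predictor consumes a directory of numbered JPEG frames,
        # so the (optionally downscaled) frames are written out before init_state.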
        frames_sink = sv.ImageSink(
            target_dir_path=frame_directory_path,
            image_name_pattern="{:05d}.jpeg"
        )
        # Get video info and scale
        video_info = sv.VideoInfo.from_video_path(video_path)
        video_info.width = int(video_info.width * self.scale_factor)
        video_info.height = int(video_info.height * self.scale_factor)
        # Split video into frames
        frames_generator = sv.get_video_frames_generator(video_path)
        with frames_sink:
            for frame in tqdm(frames_generator, total=video_info.total_frames, desc="Splitting video into frames"):
                frame = sv.scale_image(frame, self.scale_factor)
                frames_sink.save_image(frame)
        # Initialize SAM video model state
        inference_state = self.sam_video_model.init_state(
            video_path=frame_directory_path,
            device=self.device
        )
        # Add masks to inference state
        for mask_index, mask in enumerate(detections.mask):
            _, _, _ = self.sam_video_model.add_new_mask(
                inference_state=inference_state,
                frame_idx=0,
                obj_id=mask_index,
                mask=mask
            )
        # Create the output video path inside the session directory
        output_video_path = os.path.join(session_path, f"{name}.mp4")
        frames_generator = sv.get_video_frames_generator(video_path)
        masks_generator = self.sam_video_model.propagate_in_video(inference_state)
        # Process and annotate each frame
        with sv.VideoSink(output_video_path, video_info=video_info) as sink:
            for frame, (_, tracker_ids, mask_logits) in zip(frames_generator, masks_generator):
                frame = sv.scale_image(frame, self.scale_factor)
                masks = (mask_logits > 0.0).cpu().numpy().astype(bool)
                if len(masks.shape) == 4:
                    masks = np.squeeze(masks, axis=1)
                detections = sv.Detections(
                    xyxy=sv.mask_to_xyxy(masks=masks),
                    mask=masks,
                    class_id=np.array(tracker_ids)
                )
                annotated_frame = frame.copy()
                annotated_frame[:, :, :] = 0
                annotated_frame = self.mask_annotator.annotate(
                    scene=annotated_frame, detections=detections
                )
                annotated_frame = (annotated_frame > 0).astype(np.uint8) * 255
                sink.write_frame(annotated_frame)
        return output_video_path, session_path, frame_directory_path, output_directory_path
# Example usage
# video_processor = VideoProcessor()
# video_processor._load_models()
# session_path, fps, input_frames_dir, output_frames_dir = video_processor.process_video(
#     video_path="videos/clip-07-camera-2.mp4",
#     scale_factor=0.5,
#     prompt="players, basketball, rim, players shadow"
# )
# print(f"Mask frames saved in: {output_frames_dir} at {fps} fps")