import os
import tempfile

import cv2
import numpy as np
import supervision as sv
import torch
from PIL import Image
from tqdm import tqdm

from utils.video import generate_unique_name, create_directory, delete_directory
from utils.florence import load_florence_model, run_florence_inference, FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
from utils.sam import load_sam_image_model, load_sam_video_model, run_sam_inference


class VideoProcessor:
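    """Text-prompted video segmentation pipeline.

    Florence-2 performs open-vocabulary detection on the first frame, a SAM
    image model converts the detections to masks, and a SAM video model
    propagates the masks through the clip. The output is a sequence of
    black-and-white mask frames, one per video frame.
    """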
    def __init__(self, device=None):
        self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.autocast = None
        self.florence_model = None
        self.florence_processor = None
        self.sam_image_model = None
        self.sam_video_model = None
        # Mask annotator with a white palette, so annotated pixels can be
        # binarized into a clean matte.
        self.mask_annotator = sv.MaskAnnotator(
            color=sv.ColorPalette.from_hex(["#FFFFFF"]),
            color_lookup=sv.ColorLookup.INDEX
        )

    def _load_models(self):
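        """Load Florence-2 and the SAM image/video models onto self.device."""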
        self.florence_model, self.florence_processor = load_florence_model(device=self.device)
        self.sam_image_model = load_sam_image_model(device=self.device)
        self.sam_video_model = load_sam_video_model(device=self.device)

    def _enable_mixed_precision(self):
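        """Enter a bfloat16 autocast context; pair with _reset_mixed_precision()."""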
        self.autocast = torch.autocast(device_type=self.device.type, dtype=torch.bfloat16)
        self.autocast.__enter__()
        # Allow TF32 matmuls on Ampere (compute capability 8.x) and newer GPUs.
        if torch.cuda.is_available() and torch.cuda.get_device_properties(0).major >= 8:
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.backends.cudnn.allow_tf32 = True

    def _reset_mixed_precision(self):
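        """Exit the autocast context and restore default CUDA precision settings."""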
        # Exit the autocast context if it was entered.
        if self.autocast is not None:
            self.autocast.__exit__(None, None, None)
            self.autocast = None
        # Reset CUDA settings and release cached memory.
        if torch.cuda.is_available():
            torch.backends.cuda.matmul.allow_tf32 = False
            torch.backends.cudnn.allow_tf32 = False
            torch.cuda.empty_cache()

    def process_video(self, video_path, scale_factor, prompt):
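        """Segment video_path according to prompt and export mask frames.

        Returns (session_path, fps, input_frames_dir, output_frames_dir).
        The intermediate mask video is deleted after frame extraction.
        """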
        self.scale_factor = scale_factor
        # Segment the video according to the prompt and render the mask video.
        output_video_path, session_path, input_frames_dir, output_directory_path = self._process_prompt(video_path, prompt)
        # Split the mask video into individual frames.
        fps = self._create_frames(output_video_path, output_directory_path)
        # The frames are the deliverable, so drop the intermediate video.
        os.remove(output_video_path)
        return session_path, fps, input_frames_dir, output_directory_path

    def _create_frames(self, video_path, output_dir):
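        """Split video_path into JPEG frames in output_dir and return its FPS."""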
        create_directory(output_dir)
        # Open the video once; read its FPS so callers can rebuild a video
        # with matching timing.
        cap = cv2.VideoCapture(video_path)
        fps = cap.get(cv2.CAP_PROP_FPS)
        # Save every frame as a numbered JPEG in output_dir.
        count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            cv2.imwrite(f"{output_dir}/frame_{count:04d}.jpg", frame)
            count += 1
        cap.release()
        return fps

    def _process_prompt(self, video_path, prompt):
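        """Detect the prompted objects on the first frame, then track them
        through the whole video and render a black-and-white mask video.
        """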
        # Read the first frame and convert BGR -> RGB for Florence-2.
        frame_generator = sv.get_video_frames_generator(video_path)
        frame = next(frame_generator)
        frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        # A comma-separated prompt yields one detection pass per class name.
        texts = [p.strip() for p in prompt.split(",")]
        detections_list = []
        for text in texts:
            _, result = run_florence_inference(
                model=self.florence_model,
                processor=self.florence_processor,
                device=self.device,
                image=frame,
                task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
                text=text
            )
            detections = sv.Detections.from_lmm(
                lmm=sv.LMM.FLORENCE_2,
                result=result,
                resolution_wh=frame.size
            )
            detections = run_sam_inference(self.sam_image_model, frame, detections)
            detections_list.append(detections)
        # Merge detections from all prompts and refine the masks once more.
        detections = sv.Detections.merge(detections_list)
        detections = run_sam_inference(self.sam_image_model, frame, detections)
        # Abort early if nothing was detected (merged empty detections carry no masks).
        if detections.mask is None or len(detections.mask) == 0:
            raise ValueError(f"No objects of class {', '.join(texts)} found in the first frame of the video.")
        # Generate a unique name for this session and create an isolated
        # directory tree under the system temp dir.
        name = generate_unique_name()
        session_path = tempfile.mkdtemp(prefix="video_processing_")
        frame_directory_path = tempfile.mkdtemp(prefix="input_frames_", dir=session_path)
        output_directory_path = tempfile.mkdtemp(prefix="output_frames_", dir=session_path)
        frames_sink = sv.ImageSink(
            target_dir_path=frame_directory_path,
            image_name_pattern="{:05d}.jpeg"
        )
        # Get video info and apply the scale factor to the output dimensions.
        video_info = sv.VideoInfo.from_video_path(video_path)
        video_info.width = int(video_info.width * self.scale_factor)
        video_info.height = int(video_info.height * self.scale_factor)
        # Split the video into scaled frames for the SAM video model.
        frames_generator = sv.get_video_frames_generator(video_path)
        with frames_sink:
            for frame in tqdm(frames_generator, total=video_info.total_frames, desc="Splitting video into frames"):
                frame = sv.scale_image(frame, self.scale_factor)
                frames_sink.save_image(frame)
        # Initialize the SAM video model state from the extracted frames.
        inference_state = self.sam_video_model.init_state(
            video_path=frame_directory_path,
            device=self.device
        )
        # Seed the tracker with one object per first-frame mask.
        for mask_index, mask in enumerate(detections.mask):
            _, _, _ = self.sam_video_model.add_new_mask(
                inference_state=inference_state,
                frame_idx=0,
                obj_id=mask_index,
                mask=mask
            )
        # Propagate the first-frame masks through the video.
        output_video_path = os.path.join(session_path, f"{name}.mp4")
        frames_generator = sv.get_video_frames_generator(video_path)
        masks_generator = self.sam_video_model.propagate_in_video(inference_state)
        # Render each frame as a binary matte: masked pixels white, the rest black.
        with sv.VideoSink(output_video_path, video_info=video_info) as sink:
            for frame, (_, tracker_ids, mask_logits) in zip(frames_generator, masks_generator):
                frame = sv.scale_image(frame, self.scale_factor)
                masks = (mask_logits > 0.0).cpu().numpy().astype(bool)
                if len(masks.shape) == 4:
                    masks = np.squeeze(masks, axis=1)
                detections = sv.Detections(
                    xyxy=sv.mask_to_xyxy(masks=masks),
                    mask=masks,
                    class_id=np.array(tracker_ids)
                )
                # Paint the masks onto a black canvas, then binarize to pure
                # black and white.
                annotated_frame = frame.copy()
                annotated_frame[:, :, :] = 0
                annotated_frame = self.mask_annotator.annotate(
                    scene=annotated_frame, detections=detections
                )
                annotated_frame = (annotated_frame > 0).astype(np.uint8) * 255
                sink.write_frame(annotated_frame)
        return output_video_path, session_path, frame_directory_path, output_directory_path

# Example usage: a minimal driver, assuming the model weights and the sample
# clip below are available locally.
if __name__ == "__main__":
    video_processor = VideoProcessor()
    video_processor._load_models()
    video_processor._enable_mixed_precision()
    session_path, fps, input_frames_dir, output_frames_dir = video_processor.process_video(
        video_path="videos/clip-07-camera-2.mp4",
        scale_factor=0.5,
        prompt="players, basketball, rim, players shadow"
    )
    video_processor._reset_mixed_precision()
    print(f"Mask frames saved at: {output_frames_dir} ({fps:.2f} fps)")