In [1]:
# import os
# import numpy as np
# from PIL import Image

# def load_images_from_folder(folder, image_type):
# images = []
# for filename in sorted(os.listdir(folder)):
# img_path = os.path.join(folder, filename)
# if os.path.isfile(img_path):
# img = Image.open(img_path)

# # Resize the image so both dimensions are divisible by 8 while keeping the aspect ratio approximately the same.
# width, height = img.size
# new_width = (width//8)*8
# new_height = (height//8)*8
# img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
# if image_type == 'mask':
# img = img.convert('L')
# img_array = np.array(img)
# if image_type == 'mask':
# img_array = np.expand_dims(img_array, axis=-1)
# images.append(img_array)
# return np.array(images)

# input_frames = 'input_frames' # Directory for video frames
# input_masks = 'output_frames' # Directory for mask frames

# # Load video frames
# video_sequence = load_images_from_folder(input_frames, 'video')
# # Load mask frames
# mask_sequence = load_images_from_folder(input_masks, 'mask')

# # Save as .npy files
# np.save('images.npy', video_sequence)
# np.save('masks.npy', mask_sequence)

# print("Video sequence and mask sequence have been saved as .npy files.")


Video sequence shape: (12, 360, 640, 3)
Mask sequence shape: (12, 360, 640, 1)

In [2]:
# # Load the .npy files and check the images and their dimensions

# import os
# import numpy as np
# import matplotlib.pyplot as plt

# # Load the .npy files
# video_sequence = np.load('images.npy')
# mask_sequence = np.load('masks.npy')

# # Check the dimensions of the video sequence and mask sequence
# print('Video sequence shape:', video_sequence.shape)
# print('Mask sequence shape:', mask_sequence.shape)

In [3]:
import cv2, os
# Dump every frame of the source video, downscaled, into input_frames/.
os.makedirs("input_frames", exist_ok=True)

# Video input
VIDEO_INPUT = "videos/clip-07-camera-2.mp4"

# Video Scale Factor
VIDEO_SCALE_FACTOR = 0.5

# Open the video file and fail loudly instead of silently writing zero frames
cap = cv2.VideoCapture(VIDEO_INPUT)
if not cap.isOpened():
    cap.release()
    raise RuntimeError(f"Could not open video: {VIDEO_INPUT}")

try:
    # Get FPS of the video
    fps = cap.get(cv2.CAP_PROP_FPS)
    print(f"FPS of the video: {fps}")

    # Scaled output size (named properties replace the magic ids 3 and 4)
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH) * VIDEO_SCALE_FACTOR)
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) * VIDEO_SCALE_FACTOR)

    # Now save all the frames to input_frames folder
    count = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.resize(frame, (frame_width, frame_height))
        cv2.imwrite(f"input_frames/frame_{count:04d}.jpg", frame)
        count += 1
finally:
    # Always hand the decoder/file handle back, even if a write fails mid-way
    cap.release()

FPS of the video: 59.94005994005994


In [4]:
import os
import cv2
import numpy as np
import torch
from PIL import Image
from tqdm import tqdm

import supervision as sv
from utils.video import generate_unique_name, create_directory, delete_directory
from utils.florence import load_florence_model, run_florence_inference, FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
from utils.sam import load_sam_image_model, load_sam_video_model, run_sam_inference

# Constants
VIDEO_INPUT = "videos/clip-07-camera-2.mp4"
TEXT_INPUT = "players, basketball, rim, players shadow"
VIDEO_SCALE_FACTOR = 0.5
VIDEO_TARGET_DIRECTORY = "tmp"

# Create target directory
create_directory(directory_path=VIDEO_TARGET_DIRECTORY)

# Pick the device the same way VideoProcessor does: CUDA when present, CPU otherwise.
# To force CPU, replace this with torch.device("cpu").
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Enable bfloat16 autocast for the rest of the session. The context is entered
# here and never exited, so it stays active for every later cell.
torch.autocast(device_type=DEVICE.type, dtype=torch.bfloat16).__enter__()

# TF32 matmul/cuDNN only when running on CUDA with compute capability >= 8;
# querying device properties without a CUDA device would raise.
if DEVICE.type == "cuda" and torch.cuda.get_device_properties(0).major >= 8:
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

# Load models
FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=DEVICE)
SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)
SAM_VIDEO_MODEL = load_sam_video_model(device=DEVICE)



In [5]:
# Take only the first frame of the video; it is converted from OpenCV's BGR
# channel order to RGB and wrapped as a PIL image for the cells that follow.
frame_generator = sv.get_video_frames_generator(VIDEO_INPUT)
first_frame_bgr = next(frame_generator)
first_frame_rgb = cv2.cvtColor(first_frame_bgr, cv2.COLOR_BGR2RGB)
frame = Image.fromarray(first_frame_rgb)

In [6]:
# Split the comma-separated prompt into individual, whitespace-trimmed prompts
texts = list(map(str.strip, TEXT_INPUT.split(",")))

In [7]:
# One Florence open-vocabulary detection pass per prompt on the first frame.
# Each result is converted to sv.Detections and passed through
# run_sam_inference with the SAM image model before being collected.
detections_list = []
for text in texts:
    _, result = run_florence_inference(
        model=FLORENCE_MODEL,
        processor=FLORENCE_PROCESSOR,
        device=DEVICE,
        image=frame,
        task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
        text=text,
    )
    detections = run_sam_inference(
        SAM_IMAGE_MODEL,
        frame,
        sv.Detections.from_lmm(
            lmm=sv.LMM.FLORENCE_2,
            result=result,
            resolution_wh=frame.size,
        ),
    )
    detections_list.append(detections)

In [8]:
# Combine the per-prompt detections into one set, then run the SAM image
# model once more over the merged result.
detections = run_sam_inference(
    SAM_IMAGE_MODEL,
    frame,
    sv.Detections.merge(detections_list),
)

In [None]:
# NOTE(review): this cell has no execution count (In [None]) and cannot run as written:
# the statements below are indented without an enclosing block, and they reference
# `self` and `video_path`, which no earlier cell in this notebook defines.
# It looks like a pasted fragment of VideoProcessor._process_prompt — confirm and
# remove it if it is stale.
# Generate unique name for video processing
 name = generate_unique_name()
 frame_directory_path = os.path.join("tmp", name)
 create_directory(frame_directory_path)
 frames_sink = sv.ImageSink(
 target_dir_path=frame_directory_path,
 image_name_pattern="{:05d}.jpeg"
 )
 # Get video info and scale
 video_info = sv.VideoInfo.from_video_path(video_path)
 video_info.width = int(video_info.width * self.scale_factor)
 video_info.height = int(video_info.height * self.scale_factor)

 # Initialize SAM video model state
 inference_state = self.sam_video_model.init_state(
 video_path=frame_directory_path,
 device=self.device
 )

In [9]:
# --- Per-run working directory -------------------------------------------
# A freshly generated name keeps the frames (and later the output video) of
# this run separate from previous runs under VIDEO_TARGET_DIRECTORY.
name = generate_unique_name()
frame_directory_path = os.path.join(VIDEO_TARGET_DIRECTORY, name)
frames_sink = sv.ImageSink(
    target_dir_path=frame_directory_path,
    image_name_pattern="{:05d}.jpeg",
)

# --- Output video geometry -----------------------------------------------
# Start from the source video's info and shrink width/height by the scale factor.
video_info = sv.VideoInfo.from_video_path(VIDEO_INPUT)
video_info.width = int(video_info.width * VIDEO_SCALE_FACTOR)
video_info.height = int(video_info.height * VIDEO_SCALE_FACTOR)

# --- Dump scaled frames to disk for the SAM video model ------------------
frames_generator = sv.get_video_frames_generator(VIDEO_INPUT)
with frames_sink:
    for frame in tqdm(
        frames_generator,
        total=video_info.total_frames,
        desc="Splitting video into frames",
    ):
        frame = sv.scale_image(frame, VIDEO_SCALE_FACTOR)
        frames_sink.save_image(frame)

# --- SAM video state, seeded with the first-frame masks ------------------
inference_state = SAM_VIDEO_MODEL.init_state(
    video_path=frame_directory_path,
    device=DEVICE,
)

# Every first-frame mask becomes its own tracked object; its position in
# detections.mask is used as the object id.
for mask_index, mask in enumerate(detections.mask):
    _, object_ids, mask_logits = SAM_VIDEO_MODEL.add_new_mask(
        inference_state=inference_state,
        frame_idx=0,
        obj_id=mask_index,
        mask=mask,
    )

# --- Inputs for the rendering cell ---------------------------------------
# Fresh frame generator (the previous one is exhausted) plus the mask
# propagation generator; both are consumed when the output video is written.
video_path = os.path.join(VIDEO_TARGET_DIRECTORY, f"{name}.mp4")
frames_generator = sv.get_video_frames_generator(VIDEO_INPUT)
masks_generator = SAM_VIDEO_MODEL.propagate_in_video(inference_state)

Splitting video into frames: 5%|▍ | 18/397 [00:00<00:03, 100.21it/s]

Splitting video into frames: 100%|██████████| 397/397 [00:02<00:00, 192.86it/s]
frame loading (JPEG): 100%|██████████| 397/397 [00:14<00:00, 27.67it/s]


In [10]:
# A palette with a single white entry, looked up by detection index.
COLORS = ["#FFFFFF"]
COLOR_PALETTE = sv.ColorPalette.from_hex(COLORS)
MASK_ANNOTATOR = sv.MaskAnnotator(color=COLOR_PALETTE, color_lookup=sv.ColorLookup.INDEX)

In [11]:
# Render one black/white mask frame per propagated result into the output video.
counter = 0
with sv.VideoSink(video_path, video_info=video_info) as sink:
    for frame, (_, tracker_ids, mask_logits) in zip(frames_generator, masks_generator):
        # The scaled frame is only used as a template for the canvas shape/dtype
        frame = sv.scale_image(frame, VIDEO_SCALE_FACTOR)

        # Positive logits are foreground; drop the extra axis when the
        # propagated logits come back 4-dimensional.
        masks = (mask_logits > 0.0).cpu().numpy().astype(bool)
        if masks.ndim == 4:
            masks = masks.squeeze(axis=1)

        detections = sv.Detections(
            xyxy=sv.mask_to_xyxy(masks=masks),
            mask=masks,
            class_id=np.array(tracker_ids),
        )

        # Start from an all-black canvas with the frame's shape and dtype
        annotated_frame = np.zeros_like(frame)
        annotated_frame = MASK_ANNOTATOR.annotate(
            scene=annotated_frame,
            detections=detections,
        )
        # Binarise: every non-zero pixel becomes 255, everything else stays 0
        annotated_frame = np.where(annotated_frame > 0, 255, 0).astype(np.uint8)
        # Image.fromarray(annotated_frame).save(f"output_frames/{counter}.jpeg")
        counter += 1
        sink.write_frame(annotated_frame)

# The on-disk frames were only needed by the SAM video model
delete_directory(frame_directory_path)

propagate in video: 1%| | 4/397 [00:00<00:25, 15.67it/s]

propagate in video: 100%|█████████▉| 396/397 [00:32<00:00, 12.26it/s]

In [12]:
# Remove the frame folders before they are regenerated by the next cells
for stale_directory in ("input_frames", "output_frames"):
    delete_directory(stale_directory)

In [13]:
import cv2, os
# Re-create input_frames/ with every frame of the source video, downscaled.
os.makedirs("input_frames", exist_ok=True)

# Open the video file and fail loudly instead of silently writing zero frames
cap = cv2.VideoCapture(VIDEO_INPUT)
if not cap.isOpened():
    cap.release()
    raise RuntimeError(f"Could not open video: {VIDEO_INPUT}")

try:
    # Get FPS of the video
    fps = cap.get(cv2.CAP_PROP_FPS)
    print(f"FPS of the video: {fps}")

    # Scaled output size (named properties replace the magic ids 3 and 4)
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH) * VIDEO_SCALE_FACTOR)
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) * VIDEO_SCALE_FACTOR)

    # Now save all the frames to input_frames folder
    count = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.resize(frame, (frame_width, frame_height))
        cv2.imwrite(f"input_frames/frame_{count:04d}.jpg", frame)
        count += 1
finally:
    # Always release the capture, even if a write fails mid-way
    cap.release()

FPS of the video: 59.94005994005994


In [14]:
import cv2, os
# Split the rendered mask video back into individual JPEG frames in output_frames/.
os.makedirs("output_frames", exist_ok=True)

# Get FPS of the video
fps = video_info.fps

# get the video frame width and height
frame_width = video_info.width
frame_height = video_info.height

# Open the rendered video and fail loudly instead of silently writing zero frames
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    cap.release()
    raise RuntimeError(f"Could not open video: {video_path}")

try:
    # Now save all the frames to output_frames folder
    count = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.resize(frame, (frame_width, frame_height))
        cv2.imwrite(f"output_frames/frame_{count:04d}.jpg", frame)
        count += 1
finally:
    # Always release the capture, even if a write fails mid-way
    cap.release()

In [15]:
# # Create video sink and write annotated frames
# counter = 0
# with sv.VideoSink(video_path, video_info=video_info) as sink:
# for frame, (_, tracker_ids, mask_logits) in zip(frames_generator, masks_generator):
# frame = sv.scale_image(frame, VIDEO_SCALE_FACTOR)
# masks = (mask_logits > 0.0).cpu().numpy().astype(bool)
# if len(masks.shape) == 4:
# masks = np.squeeze(masks, axis=1)

# # Now combine all masks
# mask = np.zeros((frame.shape[0], frame.shape[1], 3), dtype=np.uint8)
# for individual_mask in masks:
# mask[individual_mask] = 255

# Image.fromarray(mask).save(f"output_frames/{counter}.jpeg")
# counter += 1

In [16]:
# import cv2
# import numpy as np
# import os

# # input frames (RGB video frames)
# input_frames = "input_frames"

# # output frames (Mask frames)
# output_frames = "output_frames"

# # output video
# output_video = "output_video.mp4"

# # Load the input frames
# input_frame_files = sorted(os.listdir(input_frames))
# input_frames = [cv2.imread(os.path.join(input_frames, file)) for file in input_frame_files]

# # Load the mask frames
# mask_frame_files = sorted(os.listdir(output_frames))
# mask_frames = [cv2.imread(os.path.join(output_frames, file)) for file in mask_frame_files]

# fps = 60

# # For each frame, replace the masked area of the input frame with the mask frame.
# fourcc = cv2.VideoWriter_fourcc(*'avc1')

# # Get the height and width of the frames
# height, width, _ = input_frames[0].shape

# # Create the output video writer
# out = cv2.VideoWriter(output_video, fourcc, fps, (width, height))

# # Iterate over each frame
# for i in range(len(input_frames)):
# # Get the input frame and mask frame
# input_frame = input_frames[i]
# mask_frame = mask_frames[i]

# # Replace the masked area of the input frame with the mask frame
# masked_frame = input_frame.copy()
# masked_frame[mask_frame == 255] = mask_frame[mask_frame == 255]

# # Write the frame to the output video
# out.write(masked_frame)

# # Release the video writer
# out.release()

propagate in video: 100%|█████████▉| 396/397 [00:52<00:00, 12.26it/s]

In [1]:
import cv2
import os
import torch
import numpy as np
from PIL import Image
import supervision as sv
from tqdm import tqdm
from utils.video import generate_unique_name, create_directory, delete_directory
from utils.florence import load_florence_model, run_florence_inference, FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
from utils.sam import load_sam_image_model, load_sam_video_model, run_sam_inference


class VideoProcessor:
    """Segment text-prompted objects in a video and export per-frame binary masks.

    The pipeline, driven by ``process_video``:

    1. run Florence open-vocabulary detection and SAM image inference on the
       first frame of the video, once per comma-separated prompt;
    2. write scaled frames to ``tmp/<name>/input_frames`` and propagate the
       first-frame masks through the video with the SAM video model;
    3. render a black/white mask video, split it into JPEG frames under
       ``tmp/<name>/output_frames`` and delete the intermediate video.
    """

    def __init__(self, device=None):
        """Load the Florence and SAM models.

        Args:
            device: ``torch.device`` or device string. A falsy value selects
                CUDA when it is available and CPU otherwise.
        """
        # Normalise to torch.device so `self.device.type` also works when a
        # plain string such as "cuda" is passed in.
        self.device = torch.device(
            device or ("cuda" if torch.cuda.is_available() else "cpu")
        )
        self._enable_mixed_precision()

        # Load models
        self.florence_model, self.florence_processor = load_florence_model(device=self.device)
        self.sam_image_model = load_sam_image_model(device=self.device)
        self.sam_video_model = load_sam_video_model(device=self.device)

        # Set up mask annotator with a white color palette
        self.mask_annotator = sv.MaskAnnotator(
            color=sv.ColorPalette.from_hex(["#FFFFFF"]),
            color_lookup=sv.ColorLookup.INDEX
        )

    def _enable_mixed_precision(self):
        """Turn on bfloat16 autocast and, on capable GPUs, TF32.

        The autocast context is entered and never exited, so it remains
        active after this method returns.
        """
        torch.autocast(device_type=self.device.type, dtype=torch.bfloat16).__enter__()
        # TF32 is enabled only for CUDA devices with compute capability >= 8
        if torch.cuda.is_available() and torch.cuda.get_device_properties(0).major >= 8:
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.backends.cudnn.allow_tf32 = True

    def process_video(self, video_path, scale_factor, prompt):
        """Run the full pipeline on one video.

        Args:
            video_path: Path of the input video.
            scale_factor: Positive factor applied to the frame width/height.
            prompt: Comma-separated object descriptions, e.g. ``"players, rim"``.

        Returns:
            Path of the session directory, which contains ``input_frames``
            (scaled source frames) and ``output_frames`` (mask frames).

        Raises:
            ValueError: If ``scale_factor`` is not positive, the prompt is
                blank, the video yields no frame, or nothing is detected in
                the first frame.
            RuntimeError: If the rendered mask video cannot be opened.
        """
        # Reject unusable scale factors before any expensive model inference
        if scale_factor <= 0:
            raise ValueError(f"scale_factor must be positive, got {scale_factor!r}")
        self.scale_factor = scale_factor

        # Process video based on the prompt
        output_video_path, session_path = self._process_prompt(video_path, prompt)

        try:
            # Create frames from the output video
            self._create_frames(output_video_path, os.path.join(session_path, "output_frames"))
        finally:
            # The mask video is only an intermediate artefact; remove it even
            # when frame extraction fails.
            if os.path.exists(output_video_path):
                os.remove(output_video_path)

        return session_path

    def _create_frames(self, video_path, output_dir):
        """Write every frame of ``video_path`` to ``output_dir`` as ``frame_NNNN.jpg``.

        Raises:
            RuntimeError: If OpenCV cannot open ``video_path``.
        """
        create_directory(output_dir)

        # A single capture is enough (the original opened the file twice and
        # released neither handle).
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            cap.release()
            raise RuntimeError(f"Could not open video: {video_path}")

        try:
            count = 0
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                # Frames are written at their native size; resizing a frame to
                # the capture's own width/height was a no-op.
                cv2.imwrite(os.path.join(output_dir, f"frame_{count:04d}.jpg"), frame)
                count += 1
        finally:
            cap.release()

    @staticmethod
    def _parse_prompt(prompt):
        """Split a comma-separated prompt into trimmed, non-empty entries."""
        return [text for text in (part.strip() for part in prompt.split(",")) if text]

    def _detect_objects(self, frame, texts):
        """Detect and segment every prompt in ``texts`` on a single PIL frame.

        Runs Florence open-vocabulary detection once per prompt, passes each
        result through SAM image inference, merges all detections and runs
        SAM image inference once more on the merged set.
        """
        detections_list = []
        for text in texts:
            _, result = run_florence_inference(
                model=self.florence_model,
                processor=self.florence_processor,
                device=self.device,
                image=frame,
                task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
                text=text
            )
            detections = sv.Detections.from_lmm(
                lmm=sv.LMM.FLORENCE_2,
                result=result,
                resolution_wh=frame.size
            )
            detections = run_sam_inference(self.sam_image_model, frame, detections)
            detections_list.append(detections)

        # Merge detections from all prompts
        detections = sv.Detections.merge(detections_list)
        return run_sam_inference(self.sam_image_model, frame, detections)

    def _process_prompt(self, video_path, prompt):
        """Track the prompted objects through the video and render a mask video.

        Reads ``self.scale_factor``, which ``process_video`` sets beforehand.

        Returns:
            Tuple ``(output_video_path, session_path)``.

        Raises:
            ValueError: If the prompt is blank, the video yields no frame, or
                no object is found in the first frame.
        """
        texts = self._parse_prompt(prompt)
        if not texts:
            raise ValueError("Prompt must contain at least one non-empty class name.")

        # Process the first frame with the prompt using the loaded models
        first_frame = next(sv.get_video_frames_generator(video_path), None)
        if first_frame is None:
            raise ValueError(f"Could not read any frame from video: {video_path}")
        first_frame = Image.fromarray(cv2.cvtColor(first_frame, cv2.COLOR_BGR2RGB))

        detections = self._detect_objects(first_frame, texts)

        # Check if any objects were detected (mask may be None when empty)
        if detections.mask is None or len(detections.mask) == 0:
            raise ValueError(f"No objects of class {', '.join(texts)} found in the first frame of the video.")

        # Generate unique name for video processing
        name = generate_unique_name()
        session_path = os.path.join("tmp", name)
        frame_directory_path = os.path.join(session_path, "input_frames")
        create_directory(frame_directory_path)

        frames_sink = sv.ImageSink(
            target_dir_path=frame_directory_path,
            image_name_pattern="{:05d}.jpeg"
        )

        # Get video info and scale
        video_info = sv.VideoInfo.from_video_path(video_path)
        video_info.width = int(video_info.width * self.scale_factor)
        video_info.height = int(video_info.height * self.scale_factor)

        # Split video into frames
        frames_generator = sv.get_video_frames_generator(video_path)
        with frames_sink:
            for frame in tqdm(frames_generator, total=video_info.total_frames, desc="Splitting video into frames"):
                frame = sv.scale_image(frame, self.scale_factor)
                frames_sink.save_image(frame)

        # Initialize SAM video model state
        inference_state = self.sam_video_model.init_state(
            video_path=frame_directory_path,
            device=self.device
        )

        # Add masks to inference state; the mask's position is its object id
        for mask_index, mask in enumerate(detections.mask):
            self.sam_video_model.add_new_mask(
                inference_state=inference_state,
                frame_idx=0,
                obj_id=mask_index,
                mask=mask
            )

        # Create output video path
        output_video_path = os.path.join("tmp", f"{name}.mp4")
        frames_generator = sv.get_video_frames_generator(video_path)
        masks_generator = self.sam_video_model.propagate_in_video(inference_state)

        # Process and annotate each frame
        with sv.VideoSink(output_video_path, video_info=video_info) as sink:
            for frame, (_, tracker_ids, mask_logits) in zip(frames_generator, masks_generator):
                # The scaled frame only provides the canvas shape and dtype
                frame = sv.scale_image(frame, self.scale_factor)
                masks = (mask_logits > 0.0).cpu().numpy().astype(bool)
                if len(masks.shape) == 4:
                    masks = np.squeeze(masks, axis=1)

                detections = sv.Detections(
                    xyxy=sv.mask_to_xyxy(masks=masks),
                    mask=masks,
                    class_id=np.array(tracker_ids)
                )

                # Draw the masks on an all-black canvas
                annotated_frame = self.mask_annotator.annotate(
                    scene=np.zeros_like(frame), detections=detections
                )
                # Binarise: every non-zero pixel becomes 255
                annotated_frame = (annotated_frame > 0).astype(np.uint8) * 255
                sink.write_frame(annotated_frame)

        return output_video_path, session_path


# Example usage:
# Instantiating VideoProcessor loads the Florence and SAM models; with no
# device argument it uses CUDA when available and falls back to CPU.
video_processor = VideoProcessor()
# NOTE(review): despite the variable name and the message printed below,
# process_video returns the session directory (holding input_frames/ and
# output_frames/); the intermediate mask video is deleted before it returns.
output_video = video_processor.process_video(
 video_path="videos/clip-07-camera-2.mp4", 
 scale_factor=0.5, 
 prompt="players, basketball, rim, players shadow"
)
print(f"Processed video saved at: {output_video}")


Splitting video into frames: 100%|██████████| 397/397 [00:02<00:00, 195.57it/s]
frame loading (JPEG): 100%|██████████| 397/397 [00:14<00:00, 26.87it/s]
propagate in video: 100%|█████████▉| 396/397 [00:32<00:00, 12.22it/s]


Processed video saved at: tmp/20240827202744_da1ab9da-7b6a-4b23-83b8-a1c4474c4d97
