{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# import os\n", "# import numpy as np\n", "# from PIL import Image\n", "\n", "# def load_images_from_folder(folder, image_type):\n", "# images = []\n", "# for filename in sorted(os.listdir(folder)):\n", "# img_path = os.path.join(folder, filename)\n", "# if os.path.isfile(img_path):\n", "# img = Image.open(img_path)\n", "\n", "# # Resize the image to make it divisble by 8 but keep the aspect ratio same.\n", "# width, height = img.size\n", "# new_width = (width//8)*8\n", "# new_height = (height//8)*8\n", "# img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)\n", "# if image_type == 'mask':\n", "# img = img.convert('L')\n", "# img_array = np.array(img)\n", "# if image_type == 'mask':\n", "# img_array = np.expand_dims(img_array, axis=-1)\n", "# images.append(img_array)\n", "# return np.array(images)\n", "\n", "# input_frames = 'input_frames' # Directory for video frames\n", "# input_masks = 'output_frames' # Directory for mask frames\n", "\n", "# # Load video frames\n", "# video_sequence = load_images_from_folder(input_frames, 'video')\n", "# # Load mask frames\n", "# mask_sequence = load_images_from_folder(input_masks, 'mask')\n", "\n", "# # Save as .npy files\n", "# np.save('images.npy', video_sequence)\n", "# np.save('masks.npy', mask_sequence)\n", "\n", "# print(\"Video sequence and mask sequence have been saved as .npy files.\")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Video sequence shape: (12, 360, 640, 3)\n", "Mask sequence shape: (12, 360, 640, 1)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# # load .npy file and check the images and there dimenstions\n", "\n", "# import os\n", "# import numpy as np\n", "# import matplotlib.pyplot as plt\n", "\n", "# # Load the .npy files\n", "# video_sequence = np.load('images.npy')\n", "# mask_sequence = np.load('masks.npy')\n", "\n", "# # Check the dimensions of the video sequence and mask sequence\n", "# print('Video sequence shape:', video_sequence.shape)\n", "# print('Mask sequence shape:', mask_sequence.shape)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "FPS of the video: 59.94005994005994\n" ] } ], "source": [ "import cv2, os\n", "os.makedirs(\"input_frames\", exist_ok=True)\n", "\n", "# Video input\n", "VIDEO_INPUT = \"videos/clip-07-camera-2.mp4\"\n", "\n", "# Video Scale Factor\n", "VIDEO_SCALE_FACTOR = 0.5\n", "\n", "# open the video file\n", "cap = cv2.VideoCapture(VIDEO_INPUT)\n", "\n", "# Get FPS of the video\n", "fps = cap.get(cv2.CAP_PROP_FPS)\n", "print(f\"FPS of the video: {fps}\")\n", "\n", "# get the video frame width and height\n", "frame_width = int(cap.get(3) * VIDEO_SCALE_FACTOR)\n", "frame_height = int(cap.get(4) * VIDEO_SCALE_FACTOR)\n", "\n", "# Now save all the frames to input_frames folder\n", "count = 0\n", "while True:\n", " ret, frame = cap.read()\n", " if not ret:\n", " break\n", " frame = cv2.resize(frame, (frame_width, frame_height))\n", " cv2.imwrite(f\"input_frames/frame_{count:04d}.jpg\", frame)\n", " count += 1" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
{ "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n" ] } ], "source": [ "import os\n", "import cv2\n", "import numpy as np\n", "import torch\n", "from PIL import Image\n", "from tqdm import tqdm\n", "\n", "import supervision as sv\n", "from utils.video import generate_unique_name, create_directory, delete_directory\n", "from utils.florence import load_florence_model, run_florence_inference, FLORENCE_OPEN_VOCABULARY_DETECTION_TASK\n", "from utils.sam import load_sam_image_model, load_sam_video_model, run_sam_inference\n", "\n", "# Constants\n", "VIDEO_INPUT = \"videos/clip-07-camera-2.mp4\"\n", "TEXT_INPUT = \"players, basketball, rim, players shadow\"\n", "VIDEO_SCALE_FACTOR = 0.5\n", "VIDEO_TARGET_DIRECTORY = \"tmp\"\n", "\n", "# Create target directory\n", "create_directory(directory_path=VIDEO_TARGET_DIRECTORY)\n", "\n", "# Set device\n", "DEVICE = torch.device(\"cuda\")\n", "# DEVICE = torch.device(\"cpu\")\n", "\n", "# Enable bfloat16 autocast, and TF32 on Ampere or newer GPUs\n", "torch.autocast(device_type=\"cuda\", dtype=torch.bfloat16).__enter__()\n", "if torch.cuda.get_device_properties(0).major >= 8:\n", "    torch.backends.cuda.matmul.allow_tf32 = True\n", "    torch.backends.cudnn.allow_tf32 = True\n", "\n", "# Load models\n", "FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=DEVICE)\n", "SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)\n", "SAM_VIDEO_MODEL = load_sam_video_model(device=DEVICE)" ] },
{ "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Load the first frame of the video\n", "frame_generator = sv.get_video_frames_generator(VIDEO_INPUT)\n", "frame = next(frame_generator)\n", "frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Split the comma-separated text prompt into individual queries\n", "texts = [prompt.strip() for prompt in TEXT_INPUT.split(\",\")]" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Run Florence-2 open-vocabulary detection for each query, then segment with SAM\n", "detections_list = []\n", "for text in texts:\n", "    _, result = run_florence_inference(\n", "        model=FLORENCE_MODEL,\n", "        processor=FLORENCE_PROCESSOR,\n", "        device=DEVICE,\n", "        image=frame,\n", "        task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,\n", "        text=text\n", "    )\n", "    detections = sv.Detections.from_lmm(\n", "        lmm=sv.LMM.FLORENCE_2,\n", "        result=result,\n", "        resolution_wh=frame.size\n", "    )\n", "    detections = run_sam_inference(SAM_IMAGE_MODEL, frame, detections)\n", "    detections_list.append(detections)" ] },
{ "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Merge the per-query detections and refine all masks with one more SAM pass\n", "detections = sv.Detections.merge(detections_list)\n", "detections = run_sam_inference(SAM_IMAGE_MODEL, frame, detections)" ] },
{ "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Splitting video into frames: 5%|▍ | 18/397 [00:00<00:03, 100.21it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Splitting video into frames: 100%|██████████| 397/397 [00:02<00:00, 192.86it/s]\n", "frame loading (JPEG): 100%|██████████| 397/397 [00:14<00:00, 27.67it/s]\n" ] } ], "source": [ "# Generate unique name for video processing\n", "name = generate_unique_name()\n", "frame_directory_path = os.path.join(VIDEO_TARGET_DIRECTORY, name)\n", "frames_sink = sv.ImageSink(\n", "    target_dir_path=frame_directory_path,\n", "    image_name_pattern=\"{:05d}.jpeg\"\n", ")\n", "\n", "# Get video info and scale\n", "video_info = sv.VideoInfo.from_video_path(VIDEO_INPUT)\n", "video_info.width = int(video_info.width * VIDEO_SCALE_FACTOR)\n", "video_info.height = int(video_info.height * VIDEO_SCALE_FACTOR)\n", "\n", "# Split video into frames\n", "frames_generator = sv.get_video_frames_generator(VIDEO_INPUT)\n", "with frames_sink:\n", "    for frame in tqdm(frames_generator, total=video_info.total_frames, desc=\"Splitting video into frames\"):\n", "        frame = sv.scale_image(frame, VIDEO_SCALE_FACTOR)\n", "        frames_sink.save_image(frame)\n", "\n", "# Initialize SAM video model\n", "inference_state = SAM_VIDEO_MODEL.init_state(\n", "    video_path=frame_directory_path,\n", "    device=DEVICE\n", ")\n", "\n", "# Add masks to inference state\n", "for mask_index, mask in enumerate(detections.mask):\n", "    _, object_ids, mask_logits = SAM_VIDEO_MODEL.add_new_mask(\n", "        inference_state=inference_state,\n", "        frame_idx=0,\n", "        obj_id=mask_index,\n", "        mask=mask\n", "    )\n", "\n", "# Create output video path\n", "video_path = os.path.join(VIDEO_TARGET_DIRECTORY, f\"{name}.mp4\")\n", "frames_generator = sv.get_video_frames_generator(VIDEO_INPUT)\n", "masks_generator = SAM_VIDEO_MODEL.propagate_in_video(inference_state)" ] },
{ "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# A single white color: every tracked object is drawn as a white matte\n", "COLORS = ['#FFFFFF']\n", "\n", "COLOR_PALETTE = sv.ColorPalette.from_hex(COLORS)\n", "\n", "MASK_ANNOTATOR = sv.MaskAnnotator(\n", "    color=COLOR_PALETTE,\n", "    color_lookup=sv.ColorLookup.INDEX\n", ")" ] },
{ "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "propagate in video: 1%| | 4/397 [00:00<00:25, 15.67it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "propagate in video: 100%|█████████▉| 396/397 [00:32<00:00, 12.26it/s]" ] } ], "source": [ "counter = 0\n", "with sv.VideoSink(video_path, video_info=video_info) as sink:\n", "    for frame, (_, tracker_ids, mask_logits) in zip(frames_generator, masks_generator):\n", "        frame = sv.scale_image(frame, VIDEO_SCALE_FACTOR)\n", "        masks = (mask_logits > 0.0).cpu().numpy().astype(bool)\n", "        if len(masks.shape) == 4:\n", "            masks = np.squeeze(masks, axis=1)\n", "\n", "        detections = sv.Detections(\n", "            xyxy=sv.mask_to_xyxy(masks=masks),\n", "            mask=masks,\n", "            class_id=np.array(tracker_ids)\n", "        )\n", "        # Start from an all-black canvas the same size as the original frame\n", "        annotated_frame = frame.copy()\n", "        annotated_frame[:, :, :] = 0\n", "        annotated_frame = MASK_ANNOTATOR.annotate(\n", "            scene=annotated_frame, detections=detections)\n", "        # Binarize to a pure black-and-white matte\n", "        annotated_frame = (annotated_frame > 0).astype(np.uint8) * 255\n", "        # Image.fromarray(annotated_frame).save(f\"output_frames/{counter}.jpeg\")\n", "        counter += 1\n", "        sink.write_frame(annotated_frame)\n", "\n", "delete_directory(frame_directory_path)" ] },
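{ "cell_type": "markdown", "metadata": {}, "source": [ "The cell above writes a black-and-white matte video: white wherever a tracked object is, black everywhere else. As a quick check (a sketch, not part of the original run), the matte's geometry and timing can be compared against the scaled source:" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Soft check: the matte video should match the scaled source video.\n", "# video_path and video_info come from the cells above.\n", "matte_info = sv.VideoInfo.from_video_path(video_path)\n", "print(f\"matte:  {matte_info.width}x{matte_info.height} @ {matte_info.fps} fps, {matte_info.total_frames} frames\")\n", "print(f\"source: {video_info.width}x{video_info.height} @ {video_info.fps} fps, {video_info.total_frames} frames\")" ] },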
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "delete_directory(\"input_frames\")\n", "delete_directory(\"output_frames\")" ] },
{ "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "FPS of the video: 59.94005994005994\n" ] } ], "source": [ "import cv2, os\n", "os.makedirs(\"input_frames\", exist_ok=True)\n", "\n", "# Open the video file\n", "cap = cv2.VideoCapture(VIDEO_INPUT)\n", "\n", "# Get the FPS of the video\n", "fps = cap.get(cv2.CAP_PROP_FPS)\n", "print(f\"FPS of the video: {fps}\")\n", "\n", "# Get the scaled frame width and height\n", "frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH) * VIDEO_SCALE_FACTOR)\n", "frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) * VIDEO_SCALE_FACTOR)\n", "\n", "# Save all the frames to the input_frames folder\n", "count = 0\n", "while True:\n", "    ret, frame = cap.read()\n", "    if not ret:\n", "        break\n", "    frame = cv2.resize(frame, (frame_width, frame_height))\n", "    cv2.imwrite(f\"input_frames/frame_{count:04d}.jpg\", frame)\n", "    count += 1\n", "cap.release()" ] },
{ "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "import cv2, os\n", "os.makedirs(\"output_frames\", exist_ok=True)\n", "\n", "# The matte video was written at the scaled size, so its frames can be saved as-is\n", "cap = cv2.VideoCapture(video_path)\n", "\n", "# Save all the frames to the output_frames folder\n", "count = 0\n", "while True:\n", "    ret, frame = cap.read()\n", "    if not ret:\n", "        break\n", "    cv2.imwrite(f\"output_frames/frame_{count:04d}.jpg\", frame)\n", "    count += 1\n", "cap.release()" ] },
{ "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# # Create video sink and write annotated frames\n", "# counter = 0\n", "# with sv.VideoSink(video_path, video_info=video_info) as sink:\n", "#     for frame, (_, tracker_ids, mask_logits) in zip(frames_generator, masks_generator):\n", "#         frame = sv.scale_image(frame, VIDEO_SCALE_FACTOR)\n", "#         masks = (mask_logits > 0.0).cpu().numpy().astype(bool)\n", "#         if len(masks.shape) == 4:\n", "#             masks = np.squeeze(masks, axis=1)\n", "\n", "#         # Now combine all masks\n", "#         mask = np.zeros((frame.shape[0], frame.shape[1], 3), dtype=np.uint8)\n", "#         for individual_mask in masks:\n", "#             mask[individual_mask] = 255\n", "\n", "#         Image.fromarray(mask).save(f\"output_frames/{counter}.jpeg\")\n", "#         counter += 1" ] },
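{ "cell_type": "markdown", "metadata": {}, "source": [ "The commented-out cell above and the one below both sketch compositing the mattes back over the RGB frames. Here is a runnable variant of that idea, with assumptions made explicit: it presumes input_frames/ and output_frames/ pair up one-to-one when sorted, writes to a hypothetical composite_preview.mp4, and uses the broadly available mp4v fourcc (avc1, used below, depends on the local OpenCV build). Because the mattes are stored as JPEGs, compression softens mask edges, so pixels are thresholded rather than tested for exact 255." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import cv2, os\n", "\n", "in_files = sorted(os.listdir(\"input_frames\"))\n", "mask_files = sorted(os.listdir(\"output_frames\"))\n", "assert len(in_files) == len(mask_files), \"frame/mask counts differ\"\n", "\n", "h, w, _ = cv2.imread(os.path.join(\"input_frames\", in_files[0])).shape\n", "fourcc = cv2.VideoWriter_fourcc(*\"mp4v\")  # broadly available fallback codec\n", "out = cv2.VideoWriter(\"composite_preview.mp4\", fourcc, video_info.fps, (w, h))\n", "\n", "for in_name, mask_name in zip(in_files, mask_files):\n", "    frame = cv2.imread(os.path.join(\"input_frames\", in_name))\n", "    mask = cv2.imread(os.path.join(\"output_frames\", mask_name))\n", "    # JPEG compression softens the matte, so threshold instead of testing == 255\n", "    frame[mask > 127] = 255\n", "    out.write(frame)\n", "\n", "out.release()" ] },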
{ "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "propagate in video: 100%|█████████▉| 396/397 [00:52<00:00, 12.26it/s]" ] } ], "source": [ "# import cv2\n", "# import numpy as np\n", "# import os\n", "\n", "# # Input frames (RGB video frames)\n", "# input_frames_dir = \"input_frames\"\n", "\n", "# # Output frames (mask frames)\n", "# mask_frames_dir = \"output_frames\"\n", "\n", "# # Output video\n", "# output_video = \"output_video.mp4\"\n", "\n", "# # Load the input frames\n", "# input_frame_files = sorted(os.listdir(input_frames_dir))\n", "# input_frames = [cv2.imread(os.path.join(input_frames_dir, file)) for file in input_frame_files]\n", "\n", "# # Load the mask frames\n", "# mask_frame_files = sorted(os.listdir(mask_frames_dir))\n", "# mask_frames = [cv2.imread(os.path.join(mask_frames_dir, file)) for file in mask_frame_files]\n", "\n", "# fps = 60  # approximate; the source clip runs at 59.94 fps\n", "\n", "# # Replace the masked area of each input frame with the corresponding mask frame\n", "# fourcc = cv2.VideoWriter_fourcc(*'avc1')\n", "\n", "# # Get the height and width of the frames\n", "# height, width, _ = input_frames[0].shape\n", "\n", "# # Create the output video writer\n", "# out = cv2.VideoWriter(output_video, fourcc, fps, (width, height))\n", "\n", "# # Iterate over each frame\n", "# for i in range(len(input_frames)):\n", "#     # Get the input frame and mask frame\n", "#     input_frame = input_frames[i]\n", "#     mask_frame = mask_frames[i]\n", "\n", "#     # Replace the masked area of the input frame with the mask frame\n", "#     masked_frame = input_frame.copy()\n", "#     masked_frame[mask_frame == 255] = mask_frame[mask_frame == 255]\n", "\n", "#     # Write the frame to the output video\n", "#     out.write(masked_frame)\n", "\n", "# # Release the video writer\n", "# out.release()" ] },
{ "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", "Splitting video into frames: 100%|██████████| 397/397 [00:02<00:00, 195.57it/s]\n", "frame loading (JPEG): 100%|██████████| 397/397 [00:14<00:00, 26.87it/s]\n", "propagate in video: 100%|█████████▉| 396/397 [00:32<00:00, 12.22it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Processed frames saved at: tmp/20240827202744_da1ab9da-7b6a-4b23-83b8-a1c4474c4d97\n" ] } ], "source": [ "import cv2\n", "import os\n", "import torch\n", "import numpy as np\n", "from PIL import Image\n", "import supervision as sv\n", "from tqdm import tqdm\n", "from utils.video import generate_unique_name, create_directory, delete_directory\n", "from utils.florence import load_florence_model, run_florence_inference, FLORENCE_OPEN_VOCABULARY_DETECTION_TASK\n", "from utils.sam import load_sam_image_model, load_sam_video_model, run_sam_inference\n", "\n", "\n", "class VideoProcessor:\n", "    def __init__(self, device=None):\n", "        self.device = device or torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "        self._enable_mixed_precision()\n", "\n", "        # Load models\n", "        self.florence_model, self.florence_processor = load_florence_model(device=self.device)\n", "        self.sam_image_model = load_sam_image_model(device=self.device)\n", "        self.sam_video_model = load_sam_video_model(device=self.device)\n", "\n", "        # Set up mask annotator with a white color palette\n", "        self.mask_annotator = sv.MaskAnnotator(\n", "            color=sv.ColorPalette.from_hex([\"#FFFFFF\"]),\n", "            color_lookup=sv.ColorLookup.INDEX\n", "        )\n", "\n", "    def _enable_mixed_precision(self):\n", "        torch.autocast(device_type=self.device.type, dtype=torch.bfloat16).__enter__()\n", "        if torch.cuda.is_available() and torch.cuda.get_device_properties(0).major >= 8:\n", "            torch.backends.cuda.matmul.allow_tf32 = True\n", "            torch.backends.cudnn.allow_tf32 = True\n", "\n", "    def process_video(self, video_path, scale_factor, prompt):\n", "        self.scale_factor = scale_factor\n", "\n", "        # Process video based on the prompt\n", "        output_video_path, session_path = self._process_prompt(video_path, prompt)\n", "\n", "        # Create frames from the output video\n", "        self._create_frames(output_video_path, os.path.join(session_path, \"output_frames\"))\n",
\"output_frames\"))\n", " \n", " # Delete the output video\n", " os.remove(output_video_path)\n", "\n", " return session_path\n", "\n", " def _create_frames(self, video_path, output_dir):\n", " create_directory(output_dir)\n", " # get the video frame width and height\n", " cap = cv2.VideoCapture(video_path)\n", " frame_width = int(cap.get(3))\n", " frame_height = int(cap.get(4))\n", "\n", " # open the video file\n", " cap = cv2.VideoCapture(video_path)\n", "\n", " # Now save all the frames to output_frames folder\n", " count = 0\n", " while True:\n", " ret, frame = cap.read()\n", " if not ret:\n", " break\n", " frame = cv2.resize(frame, (frame_width, frame_height))\n", " cv2.imwrite(f\"{output_dir}/frame_{count:04d}.jpg\", frame)\n", " count += 1\n", "\n", "\n", " def _process_prompt(self, video_path, prompt):\n", " # Process the first frame with the prompt using the loaded models\n", " frame_generator = sv.get_video_frames_generator(video_path)\n", " frame = next(frame_generator)\n", " frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))\n", " \n", " texts = [p.strip() for p in prompt.split(\",\")]\n", " detections_list = []\n", "\n", " for text in texts:\n", " _, result = run_florence_inference(\n", " model=self.florence_model,\n", " processor=self.florence_processor,\n", " device=self.device,\n", " image=frame,\n", " task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,\n", " text=text\n", " )\n", " detections = sv.Detections.from_lmm(\n", " lmm=sv.LMM.FLORENCE_2,\n", " result=result,\n", " resolution_wh=frame.size\n", " )\n", " detections = run_sam_inference(self.sam_image_model, frame, detections)\n", " detections_list.append(detections)\n", "\n", " # Merge detections from all prompts\n", " detections = sv.Detections.merge(detections_list)\n", " detections = run_sam_inference(self.sam_image_model, frame, detections)\n", "\n", " # Check if any objects were detected\n", " if len(detections.mask) == 0:\n", " raise ValueError(f\"No objects of class {', '.join(texts)} found in the first frame of the video.\")\n", "\n", " # Generate unique name for video processing\n", " name = generate_unique_name()\n", " session_path = os.path.join(\"tmp\", name)\n", " frame_directory_path = os.path.join(session_path, \"input_frames\")\n", " create_directory(frame_directory_path)\n", "\n", " frames_sink = sv.ImageSink(\n", " target_dir_path=frame_directory_path,\n", " image_name_pattern=\"{:05d}.jpeg\"\n", " )\n", "\n", " # Get video info and scale\n", " video_info = sv.VideoInfo.from_video_path(video_path)\n", " video_info.width = int(video_info.width * self.scale_factor)\n", " video_info.height = int(video_info.height * self.scale_factor)\n", "\n", " # Split video into frames\n", " frames_generator = sv.get_video_frames_generator(video_path)\n", " with frames_sink:\n", " for frame in tqdm(frames_generator, total=video_info.total_frames, desc=\"Splitting video into frames\"):\n", " frame = sv.scale_image(frame, self.scale_factor)\n", " frames_sink.save_image(frame)\n", "\n", " # Initialize SAM video model state\n", " inference_state = self.sam_video_model.init_state(\n", " video_path=frame_directory_path,\n", " device=self.device\n", " )\n", "\n", " # Add masks to inference state\n", " for mask_index, mask in enumerate(detections.mask):\n", " _, _, _ = self.sam_video_model.add_new_mask(\n", " inference_state=inference_state,\n", " frame_idx=0,\n", " obj_id=mask_index,\n", " mask=mask\n", " )\n", "\n", " # Create output video path\n", " output_video_path = os.path.join(\"tmp\", f\"{name}.mp4\")\n", " 
"        frames_generator = sv.get_video_frames_generator(video_path)\n", "        masks_generator = self.sam_video_model.propagate_in_video(inference_state)\n", "\n", "        # Process and annotate each frame\n", "        with sv.VideoSink(output_video_path, video_info=video_info) as sink:\n", "            for frame, (_, tracker_ids, mask_logits) in zip(frames_generator, masks_generator):\n", "                frame = sv.scale_image(frame, self.scale_factor)\n", "                masks = (mask_logits > 0.0).cpu().numpy().astype(bool)\n", "                if len(masks.shape) == 4:\n", "                    masks = np.squeeze(masks, axis=1)\n", "\n", "                detections = sv.Detections(\n", "                    xyxy=sv.mask_to_xyxy(masks=masks),\n", "                    mask=masks,\n", "                    class_id=np.array(tracker_ids)\n", "                )\n", "\n", "                # Start from an all-black canvas, then binarize to a matte\n", "                annotated_frame = frame.copy()\n", "                annotated_frame[:, :, :] = 0\n", "                annotated_frame = self.mask_annotator.annotate(\n", "                    scene=annotated_frame, detections=detections\n", "                )\n", "                annotated_frame = (annotated_frame > 0).astype(np.uint8) * 255\n", "                sink.write_frame(annotated_frame)\n", "\n", "        return output_video_path, session_path\n", "\n", "\n", "# Example usage:\n", "video_processor = VideoProcessor()\n", "session_path = video_processor.process_video(\n", "    video_path=\"videos/clip-07-camera-2.mp4\",\n", "    scale_factor=0.5,\n", "    prompt=\"players, basketball, rim, players shadow\"\n", ")\n", "print(f\"Processed frames saved at: {session_path}\")\n" ] } ], "metadata": { "kernelspec": { "display_name": "vor", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.14" } }, "nbformat": 4, "nbformat_minor": 2 }