"""Sample frames from .mp4 videos and index them in a persistent ChromaDB collection."""

import os
import zipfile  # noqa: F401  (unused here; kept in case other code relies on it)

import chromadb
import cv2
from chromadb.utils.data_loaders import ImageLoader
from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction

# Persistent vector store location and the shared collection used by every
# function below. These are created at import time.
path = "mm_vdb2"
client = chromadb.PersistentClient(path=path)

image_loader = ImageLoader()
CLIP = OpenCLIPEmbeddingFunction()
video_collection = client.get_or_create_collection(
    name='video_collection',
    embedding_function=CLIP,
    data_loader=image_loader,
)


def extract_frames(video_folder, output_folder):
    """Write sampled frames of every ``.mp4`` in *video_folder* as JPEG files.

    For each video a subfolder named after the video (without extension) is
    created under *output_folder*. The first frame, one frame roughly every
    five seconds, and the last frame (as reported by the container's frame
    count) are saved as ``frame_<second>.jpg``.

    Videos whose FPS cannot be determined are skipped and get no subfolder.

    Args:
        video_folder: Directory scanned (non-recursively) for ``.mp4`` files.
        output_folder: Directory that receives one subfolder per video.
    """
    os.makedirs(output_folder, exist_ok=True)

    for video_filename in os.listdir(video_folder):
        if not video_filename.endswith('.mp4'):
            continue

        video_path = os.path.join(video_folder, video_filename)
        video_capture = cv2.VideoCapture(video_path)
        try:
            fps = video_capture.get(cv2.CAP_PROP_FPS)
            # A zero, negative or NaN FPS would make the timing math below
            # divide by zero; skip the file rather than abort the whole folder.
            if not fps > 0:
                continue

            frame_count = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
            # Number of frames between samples (~5 s); never let it reach 0,
            # which would break the modulo for very low frame rates.
            frame_interval = max(1, int(fps * 5))

            output_subfolder = os.path.join(
                output_folder, os.path.splitext(video_filename)[0]
            )
            os.makedirs(output_subfolder, exist_ok=True)

            frame_number = 0
            success, image = video_capture.read()
            while success:
                is_sample = frame_number % frame_interval == 0
                is_last = frame_number == frame_count - 1
                if is_sample or is_last:
                    frame_time = frame_number / fps
                    output_frame_filename = os.path.join(
                        output_subfolder, f'frame_{int(frame_time)}.jpg'
                    )
                    cv2.imwrite(output_frame_filename, image)
                success, image = video_capture.read()
                frame_number += 1
        finally:
            # Release the capture handle even if reading or writing fails.
            video_capture.release()


def add_frames_to_chromadb(video_dir, frames_dir):
    """Add the extracted frames of every ``.mp4`` in *video_dir* to ``video_collection``.

    Frames are looked up in ``<frames_dir>/<video title>/*.jpg``. Each frame is
    added with the id ``<frame name>_<video title>``, its file path as URI and
    a ``video_uri`` metadata entry pointing at the source video.

    Args:
        video_dir: Directory containing the source ``.mp4`` files.
        frames_dir: Directory containing one frame subfolder per video.
    """
    ids = []
    uris = []
    metadatas = []

    for video_file in os.listdir(video_dir):
        if not video_file.endswith('.mp4'):
            continue

        video_title = video_file[:-4]
        frame_folder = os.path.join(frames_dir, video_title)
        if not os.path.isdir(frame_folder):
            # No frames were extracted for this video.
            continue

        video_path = os.path.join(video_dir, video_file)
        for frame in os.listdir(frame_folder):
            if not frame.endswith('.jpg'):
                continue
            ids.append(f"{frame[:-4]}_{video_title}")
            uris.append(os.path.join(frame_folder, frame))
            metadatas.append({'video_uri': video_path})

    # Nothing to index: avoid sending an empty batch to the collection.
    if not ids:
        return

    video_collection.add(ids=ids, uris=uris, metadatas=metadatas)


def process_video_files(video_paths):
    """Extract and index frames for the folders that contain *video_paths*.

    Note that processing is folder-based: every ``.mp4`` found in the parent
    folder of each given path is processed, not only the listed files. Each
    distinct folder is handled once, however many of its videos are listed.

    Args:
        video_paths: Iterable of paths to video files.

    Returns:
        The module-level ``video_collection``.
    """
    frames_output_folder = r"extracted_frames"
    os.makedirs(frames_output_folder, exist_ok=True)

    processed_folders = set()
    for video_path in video_paths:
        # A bare filename has an empty dirname; treat it as the current directory.
        video_folder = os.path.dirname(video_path) or os.curdir

        folder_key = os.path.abspath(video_folder)
        if folder_key in processed_folders:
            continue
        processed_folders.add(folder_key)

        extract_frames(video_folder, frames_output_folder)
        add_frames_to_chromadb(video_folder, frames_output_folder)

    return video_collection


# Example usage:
# video_paths = [
#     "/path/to/video1.mp4",
#     "/path/to/video2.mp4",
#     "/path/to/video3.mp4"
# ]
# process_video_files(video_paths)