"""Gradio annotation tool for physics / commonsense violations in generated videos.

Annotators step through a list of candidate videos and answer three tasks per
video (instruction following, commonsense violations, physical-law violations).
Results are persisted as one JSON file per labeler, named after an MD5 digest
of the labeler's email.
"""

import gradio as gr
import os
import json
import threading
from pathlib import Path
from moviepy.editor import VideoFileClip
import hashlib
import random
import string
from PIL import Image

# Task 3 choices: physical-law violations (multi-select checkboxes).
PHYSICAL_LAWS = [
    "Violation of Newton's Law: Objects move without any external force.",
    "Violation of the Law of Conservation of Mass or Solid Constitutive Law: Objects deform or distort irregularly.",
    "Violation of Fluid Constitutive Law: Liquids flow in an unnatural or irregular manner.",
    "Violation of Non-physical Penetration: Objects unnaturally pass through each other.",
    "Violation of Gravity: Objects behave inconsistently with gravity, such as floating in the air.",
    "No violation!"
]

# List of commonsense violations (Task 2 checkboxes).
COMMON_SENSE = [
    "Poor Aesthetics: Visually unappealing or low-quality content.",
    "Temporal Inconsistency: Flickering, choppiness, or sudden appearance/disappearance of irrelevant objects.",
    "No violation!"
]

# Example images for physical law violations (shown under the law checkboxes;
# there is one example per law except the final "No violation!" entry).
EXAMPLE_IMAGES = {
    "newtons_law": "test_images/law_violation1.jpg",
    "mass_conservation": "test_images/law_violation2.jpg",
    "fluid.": "test_images/law_violation3.jpg",
    "penetration": "test_images/law_violation4.jpg",
    "gravity": "test_images/law_violation5.jpg"
}


def string_to_md5(input_string: str, max_digits: int = 12) -> str:
    """Return the first ``max_digits`` hex characters of MD5(input_string)."""
    return hashlib.md5(input_string.encode()).hexdigest()[:max_digits]


def generate_random_id(length: int = 6) -> str:
    """Return a random lowercase-alphanumeric identifier of ``length`` chars."""
    return ''.join(random.choices(string.ascii_lowercase + string.digits, k=length))


class VideoAnnotator:
    """Holds the video playlist, current position, and per-labeler annotation I/O."""

    def __init__(self, videos, annotation_base_dir, max_resolution=(640, 480)):
        self.annotation_base_dir = Path(annotation_base_dir)
        self.max_resolution = max_resolution
        self.videos = videos
        self.current_index = 0
        # One lock per annotation file so concurrent saves don't interleave.
        self.file_locks = {}
        self.current_labeler = None
        self.current_labeler_file = None

    def get_annotation_file_path(self, labeler_email):
        """Return the JSON annotation path for ``labeler_email`` (MD5-keyed)."""
        md5_email = string_to_md5(labeler_email, max_digits=12)
        file_name = f"md5-{md5_email}.json"
        return self.annotation_base_dir / file_name

    def load_annotations(self, labeler_email):
        """Load this labeler's saved annotations, or {} if none exist yet."""
        file_path = self.get_annotation_file_path(labeler_email)
        if file_path.exists():
            with open(file_path, 'r') as f:
                return json.load(f)
        return {}

    def save_annotations(self, labeler_email, annotations):
        """Write ``annotations`` to the labeler's file under a per-file lock."""
        file_path = self.get_annotation_file_path(labeler_email)
        self.annotation_base_dir.mkdir(parents=True, exist_ok=True)
        if file_path not in self.file_locks:
            self.file_locks[file_path] = threading.Lock()
        with self.file_locks[file_path]:
            with open(file_path, 'w') as f:
                json.dump(annotations, f, indent=2)

    def get_current_video(self):
        """Return the absolute path of the current (possibly resized) video, or None."""
        if self.videos:
            video_path = self.videos[self.current_index]
            resized_path = self.resize_video_if_needed(video_path)
            return str(resized_path.resolve())
        return None

    def resize_video_if_needed(self, video_path):
        """Downscale ``video_path`` to fit ``self.max_resolution`` if it exceeds it.

        The resized copy is written next to the original as ``resized_<name>``
        and reused on subsequent calls so each video is re-encoded at most once.
        Returns the path to serve (original or resized).
        """
        from moviepy.video.io.ffmpeg_writer import ffmpeg_write_video
        clip = VideoFileClip(str(video_path))
        try:
            width, height = clip.size
            if width <= self.max_resolution[0] and height <= self.max_resolution[1]:
                return video_path
            cleaned_name = video_path.name.replace(" ", "_")
            resized_path = video_path.with_name(f"resized_{cleaned_name}")
            if resized_path.exists():
                # Already encoded on a previous visit; skip the expensive re-encode.
                return resized_path
            resized_clip = clip.resize(height=self.max_resolution[1])
            fps = clip.fps if clip.fps else 8.0  # fall back when fps metadata is missing
            ffmpeg_write_video(resized_clip, str(resized_path), fps, codec="libx264")
            return resized_path
        finally:
            # Release the ffmpeg reader; the original code leaked one per call.
            clip.close()

    def update_annotation(self, video_name, labeler_email, instruction_check,
                          law_annotations, commonsense):
        """Record one video's annotation and persist the labeler's file.

        NOTE(review): an entry is only added when ``instruction_check`` is set
        AND the video has no prior entry — existing annotations are never
        overwritten. Presumably intentional (first answer wins); confirm.
        """
        video_name = postprocess_name_for_gradio(video_name)
        annotations = self.load_annotations(labeler_email)
        if instruction_check and video_name not in annotations:
            annotations[video_name] = {
                "labeler": labeler_email,
                "law_details": law_annotations,
                "commonsense": commonsense,
                "instruction": instruction_check
            }
        self.save_annotations(labeler_email, annotations)

    def next_video(self):
        """Advance to the next video (clamped at the end) and return its path."""
        if self.videos:
            self.current_index = min(self.current_index + 1, len(self.videos) - 1)
            return self.get_current_video()

    def prev_video(self):
        """Step back to the previous video (clamped at 0) and return its path."""
        if self.videos:
            self.current_index = max(self.current_index - 1, 0)
            return self.get_current_video()

    def jump_to_video(self, index):
        """Jump to ``index`` (clamped into range) and return the video path."""
        if self.videos:
            self.current_index = max(0, min(index, len(self.videos) - 1))
            return self.get_current_video()

    def set_current_labeler(self, labeler_email):
        """Remember the active labeler and cache their annotation file path."""
        self.current_labeler = labeler_email
        self.current_labeler_file = self.get_annotation_file_path(labeler_email)


def postprocess_name_for_gradio(name: str) -> str:
    """Strip characters/substrings that gradio mangles in served file names.

    Note the order matters: "-" is removed before "t2v", so e.g. "t2-v"
    collapses to "t2v" and is then removed too (same as the original chain).
    """
    for token in ("–", "+", "-", "t2v", "(", ")", ",", "_", "."):
        name = name.replace(token, "")
    return name


def get_cur_data(instruction_data, video_name):
    """Return the instruction entry whose key is a substring of ``video_name``.

    Raises ``KeyError`` when nothing matches (previously this surfaced as an
    unbound-local ``NameError``). When several keys match, the last one wins,
    matching the original scan order.
    """
    video_name = postprocess_name_for_gradio(video_name)
    if "resized_" in video_name:
        # NOTE(review): postprocessing strips underscores, so this branch looks
        # unreachable; kept verbatim in case callers bypass postprocessing.
        clean_name = video_name.replace("resized_", "")
        clean_name = "_".join(clean_name.split("_")[2:])
    else:
        clean_name = video_name
    real_name = None
    for k in instruction_data.keys():
        if k in clean_name:
            real_name = k
    if real_name is None:
        raise KeyError(f"no instruction entry matches video name {clean_name!r}")
    return instruction_data[real_name]


def create_interface(instruction_data, videos, annotation_base_dir):
    """Build and return the gradio Blocks annotation interface."""
    annotator = VideoAnnotator(videos, annotation_base_dir)

    def update_video():
        """Return the 14 output values (video, labeler, count, instruction radio,
        text prompt, 6 law checkboxes, 3 commonsense checkboxes) for the
        current video."""
        video_path = annotator.get_current_video()
        if video_path is None:
            # Must match all_outputs exactly: the original returned 13 values
            # for 14 components (and fed a system string into the Radio).
            return [None,
                    annotator.current_labeler or "",
                    "0",
                    None,
                    "[system] Video not in benchmark",
                    *[False for _ in PHYSICAL_LAWS],
                    *[False for _ in COMMON_SENSE]]

        video_name = Path(video_path).name
        cur_data = get_cur_data(instruction_data, video_name)

        num_annotations = "0"
        if annotator.current_labeler:
            annotations = annotator.load_annotations(annotator.current_labeler)
            num_annotations = str(len(annotations))
            current_annotations = annotations.get(
                postprocess_name_for_gradio(video_name),
                {"labeler": annotator.current_labeler,
                 "law_details": {law: False for law in PHYSICAL_LAWS},
                 "commonsense": {cs: False for cs in COMMON_SENSE},
                 "instruction": None}
            )
        else:
            current_annotations = {"labeler": "",
                                   "law_details": {law: False for law in PHYSICAL_LAWS},
                                   "commonsense": {cs: False for cs in COMMON_SENSE},
                                   "instruction": None}

        outputs = [
            video_path,
            current_annotations["labeler"] or "",
            num_annotations,
            current_annotations["instruction"],
            cur_data["text_instruction"]
        ]
        outputs.extend(current_annotations["law_details"].get(law, False)
                       for law in PHYSICAL_LAWS)
        outputs.extend(current_annotations["commonsense"].get(cs, False)
                       for cs in COMMON_SENSE)
        return outputs

    def save_current_annotation(video_path, labeler_email, instruction_check,
                                law_values, commonsense_values, skipped: bool = False):
        """Persist the current annotation; input validation is bypassed when
        ``skipped`` (corrupted-video path). Returns a status message."""
        if not skipped:
            if video_path is None:
                return "No video loaded to save annotations."
            if not labeler_email:
                return "Please enter a valid labeler email before saving annotations."
        video_name = Path(video_path).name
        law_annotations = {law: bool(value)
                           for law, value in zip(PHYSICAL_LAWS, law_values)}
        commonsense_annotations = {cs: bool(value)
                                   for cs, value in zip(COMMON_SENSE, commonsense_values)}
        annotator.set_current_labeler(labeler_email)
        annotator.update_annotation(video_name, labeler_email, instruction_check,
                                    law_annotations, commonsense_annotations)
        return f"Annotation saved successfully for {labeler_email}!"

    def load_anns_callback(labeler_email):
        """Load annotations for the given labeler email and jump to the next
        unlabeled video. Returns the updated interface state."""
        if not labeler_email:
            return update_video()

        annotator.set_current_labeler(labeler_email)
        annotations = annotator.load_annotations(labeler_email)

        # Find the first video that hasn't been annotated yet (annotation keys
        # are stored under the postprocessed "resized_" name).
        next_unannotated_index = None
        for i, video in enumerate(annotator.videos):
            video_name = postprocess_name_for_gradio("resized_" + Path(video).name)
            if video_name not in annotations:
                next_unannotated_index = i
                break

        if next_unannotated_index is not None:
            annotator.jump_to_video(next_unannotated_index)
            video_path = annotator.get_current_video()
            video_name = Path(video_path).name
            cur_data = get_cur_data(instruction_data, video_name)
            # Fresh defaults for a not-yet-annotated video.
            return [
                video_path,
                labeler_email,
                str(len(annotations)),
                None,
                cur_data["text_instruction"],
                *[False for _ in PHYSICAL_LAWS],
                *[False for _ in COMMON_SENSE]
            ]

        # All videos annotated: stay on the current one and show its saved state.
        current_video = annotator.get_current_video()
        if current_video:
            video_name = Path(current_video).name
            current_annotations = annotations.get(
                postprocess_name_for_gradio(video_name),
                {"labeler": labeler_email,
                 "law_details": {law: False for law in PHYSICAL_LAWS},
                 "commonsense": {cs: False for cs in COMMON_SENSE},
                 "instruction": "3"}
            )
            cur_data = get_cur_data(instruction_data, video_name)
            return [
                current_video,
                labeler_email,
                str(len(annotations)),
                current_annotations["instruction"],
                cur_data["text_instruction"],
                *[current_annotations["law_details"].get(law, False)
                  for law in PHYSICAL_LAWS],
                *[current_annotations["commonsense"].get(cs, False)
                  for cs in COMMON_SENSE]
            ]

        # Fallback for an empty video list.
        return [
            None,
            labeler_email,
            "0",
            None,
            "[system] No videos available",
            *[False for _ in PHYSICAL_LAWS],
            *[False for _ in COMMON_SENSE]
        ]

    def check_inputs(labeler_email, instruction_check):
        """Validate required inputs; return (is_valid, error_message)."""
        if not labeler_email:
            return False, "Please enter your email before proceeding."
        if not instruction_check:
            return False, "Please select whether the video follows the instruction before proceeding."
        return True, ""

    def confirm_callback(video_path, labeler_email, instruction_check, *checkbox_values):
        # NOTE(review): intentionally a stub — the Confirm button is not wired
        # to any handler below.
        pass

    def skip_callback(video_path, labeler_email, instruction_check, *checkbox_values):
        """Save the annotation with the skipped flag (corrupted video) and advance."""
        num_laws = len(PHYSICAL_LAWS)
        law_values = checkbox_values[:num_laws]
        commonsense_values = checkbox_values[num_laws:]
        save_current_annotation(video_path, labeler_email, instruction_check,
                                law_values, commonsense_values, skipped=True)
        annotator.next_video()
        return update_video()

    def next_video_callback(video_path, labeler_email, instruction_check, *checkbox_values):
        """Validate, save the current annotation, and advance to the next video."""
        is_valid, message = check_inputs(labeler_email, instruction_check)
        if not is_valid:
            gr.Warning(message)
            return update_video()
        num_laws = len(PHYSICAL_LAWS)
        law_values = checkbox_values[:num_laws]
        commonsense_values = checkbox_values[num_laws:]
        save_current_annotation(video_path, labeler_email, instruction_check,
                                law_values, commonsense_values)
        annotator.next_video()
        return update_video()

    def prev_video_callback(video_path, labeler_email, instruction_check, *checkbox_values):
        """Validate, save the current annotation, and go back one video."""
        is_valid, message = check_inputs(labeler_email, instruction_check)
        if not is_valid:
            gr.Warning(message)
            return update_video()
        num_laws = len(PHYSICAL_LAWS)
        law_values = checkbox_values[:num_laws]
        commonsense_values = checkbox_values[num_laws:]
        save_current_annotation(video_path, labeler_email, instruction_check,
                                law_values, commonsense_values)
        annotator.prev_video()
        return update_video()

    with gr.Blocks() as interface:
        with gr.Row():
            with gr.Column(scale=1):
                video = gr.Video(label="Current Video", format="mp4", height=450, width=800)
                with gr.Row():
                    with gr.Column(scale=2):
                        labeler = gr.Textbox(
                            label="Labeler ID (your email)",
                            placeholder="Enter your email",
                            interactive=True,
                        )
                    with gr.Column(scale=1):
                        num_annotations = gr.Textbox(
                            label="Annotations Count",
                            placeholder="0",
                            interactive=False,
                        )
                text_instruction = gr.Textbox(label="Text prompt", interactive=False)
                instruction_check = gr.Radio(
                    label="Task1: Does this video follow the instruction?",
                    choices=[
                        "0: Not at all!!!",
                        "1: Correct object, wrong motion (or vice versa).",
                        "2: Follow instruction, fail task.",
                        "3: Follow instruction, complete task."
                    ],
                    type="value",
                    value="3"
                )
                with gr.Row():
                    with gr.Column(scale=1):
                        skip_btn = gr.Button("Skip! Video Corrupted")
                    with gr.Column(scale=1):
                        confirm_btn = gr.Button("Confirm!")
                with gr.Row():
                    with gr.Column(scale=1):
                        prev_btn = gr.Button("Previous Video")
                    with gr.Column(scale=1):
                        next_btn = gr.Button("Next Video")
                load_btn = gr.Button("Load Annotations")
            with gr.Column(scale=1):
                gr.Markdown("Task2: [Based on your first impression] Select the major commonsense violations in the video: [multiple (0-2) choices]")
                commonsense_checkboxes = []
                for cs in COMMON_SENSE:
                    commonsense_checkboxes.append(gr.Checkbox(label=cs))
                gr.Markdown("Task3: Please select all physics laws the video violates: [multiple (0-5) choices]")
                law_checkboxes = []
                for i, law in enumerate(PHYSICAL_LAWS):
                    checkbox = gr.Checkbox(label=law, interactive=True)
                    law_checkboxes.append(checkbox)
                    # Every law except the trailing "No violation!" gets an example image.
                    if i != len(PHYSICAL_LAWS) - 1:
                        image_path = list(EXAMPLE_IMAGES.values())[i]
                        image = Image.open(image_path).convert("RGB")
                        gr.Image(value=image, label=f"Example {i+1}", show_label=True, height=68, width=700)

        # Flat input/output lists shared by every event handler; their order
        # must match what update_video() returns.
        all_inputs = [video, labeler, instruction_check] + law_checkboxes + commonsense_checkboxes
        all_outputs = [video, labeler, num_annotations, instruction_check, text_instruction] + law_checkboxes + commonsense_checkboxes

        skip_btn.click(skip_callback, inputs=all_inputs, outputs=all_outputs)
        load_btn.click(load_anns_callback, inputs=[labeler], outputs=all_outputs)
        next_btn.click(next_video_callback, inputs=all_inputs, outputs=all_outputs)
        prev_btn.click(prev_video_callback, inputs=all_inputs, outputs=all_outputs)
        interface.load(fn=update_video, inputs=None, outputs=all_outputs)

    return interface


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Annotation")
    parser.add_argument("--domain", type=str, default="robotics", help="")
    parser.add_argument("--src", type=str, default="CogVideo-T2V", help="")
    args = parser.parse_args()

    domains = ["robotics", "humans", "general", "av", "game"]
    src = ["CogVideo-I2V", "CogVideo-T2V", "Open-Sora-I2V", "Open-Sora-T2V",
           "Pandora", "TurboT2V", "Open-Sora-Plan-I2V", "Open-Sora-Plan-T2V"]
    assert args.domain in domains, f"{args.domain} not in available domain."
    assert args.src in src, f"{args.src} not in available model src."

    src_video_map = {
        "CogVideo-I2V": "/home/yunhaof/workspace/datasets/outputs_v2",
        "CogVideo-T2V": "/home/yunhaof/workspace/datasets/outputs_v2",
        "Pandora": "/lustre/fsw/portfolios/nvr/users/dachengl/VILA-EWM/outputs",
        "Open-Sora-I2V": "/lustre/fsw/portfolios/nvr/users/dachengl/Open-Sora/outputs",
        "Open-Sora-T2V": "/lustre/fsw/portfolios/nvr/users/dachengl/Open-Sora/outputs",
        "TurboT2V": "",
        "Open-Sora-Plan-I2V": "/home/yunhaof/workspace/projects/Open-Sora-Plan/ewm_benchmark/gradio_videos",
        "Open-Sora-Plan-T2V": "/home/yunhaof/workspace/projects/Open-Sora-Plan/ewm_benchmark/gradio_videos"
    }

    # Adhoc solution to naming mismatch
    domain_name_map = {
        "humans": "humans",
        "game": "game",
        "general": "general",
        "av": "av",
        "robotics": "robotics"
    }
    cur_domain = domain_name_map[args.domain]
    video_folder = Path(src_video_map[args.src])

    # Per-source stem predicate: T2V sources require the "t2v" tag, I2V
    # sources exclude it, Pandora accepts both. (Replaces the original
    # 8-branch copy-pasted if/elif chain with identical selection logic.)
    if args.src == "Pandora":
        def src_filter(stem):
            return True
    elif "T2V" in args.src:
        def src_filter(stem):
            return "t2v" in stem
    else:
        def src_filter(stem):
            return "t2v" not in stem

    videos = []
    for v in video_folder.glob("*.mp4"):
        stem = v.stem
        if "resized_" in stem or f"{cur_domain}_" not in stem:
            continue  # never serve cached resizes; keep only this domain
        if src_filter(stem):
            videos.append(v)
    videos = sorted(videos)
    print(f"Number of videos: {len(videos)}")

    instruction_file = f"domains/{args.domain}/dataset_v2/instruction_ewm.json"
    annotation_base = "annotations"
    os.makedirs(annotation_base, exist_ok=True)
    annotation_dir = os.path.join(annotation_base, f"{args.domain}_{args.src}")

    instruction_data = {}
    with open(instruction_file, "r") as f:
        instructions = json.load(f)
        for instruction in instructions:
            file_name = os.path.basename(instruction["video_path"])
            # gradio will eliminate "-" and friends from served names
            file_name = postprocess_name_for_gradio(file_name)
            instruction_data[file_name] = instruction

    # Sanity check: every video must resolve to an instruction entry, both
    # with and without the "resized_" prefix gradio will serve it under.
    for _video in videos:
        base_name = postprocess_name_for_gradio(Path(_video).name)
        for candidate in (base_name, "resized_" + base_name):
            try:
                _ = get_cur_data(instruction_data, candidate)
            except KeyError:
                # assert False was stripped under -O; fail loudly instead.
                raise SystemExit(
                    f"parsing name {candidate} fails, you may want to look at "
                    f"the name in instruction_ewm.json")

    iface = create_interface(instruction_data, videos, annotation_dir)
    iface.launch(share=True, allowed_paths=[src_video_map[args.src]])