import subprocess from typing import List, Dict, Any from dataclasses import dataclass from abc import ABC, abstractmethod from PIL import Image from pathlib import Path import numpy as np import cv2 import clip import torch from torch import nn import torch.nn.functional as F from typing import Tuple import os import json from diffusers import CogVideoXPipeline from diffusers.utils import export_to_video from video_generation_evaluation.toolkit.fvd import get_dataset_features, I3DFeatureExtractor from numpy import cov from numpy import mean from scipy.linalg import sqrtm from video_generation_evaluation.evaluate import task2dimension class BaseTask(ABC): def __init__(self, task_data: str, model): self.task_data = task_data self.model = model self.data = self._parse_data(task_data) @abstractmethod def _parse_data(self, task_data: Dict[str, Any]): pass @abstractmethod def evaluate(self) -> Dict[str, float]: pass @abstractmethod def run_inference(self): pass class T2VTask(BaseTask): def _parse_result_file(self, output_dir: Path) -> float | None: for jsonfile in output_dir.iterdir(): if "eval" in jsonfile.name: with open(jsonfile.as_posix(), "r") as file: data = json.load(file) return float(data[self.taskname][0]) def _parse_data(self, task_data): with open(task_data, "r") as file: annos = json.load(file) taskname = annos["task"].replace(" ", "") self.taskname = taskname self.save_root = os.path.join("General-Bench", "Video-Generation", taskname) return annos["data"] def run_inference(self): for d in self.data: prompt = d["input"]["prompt"] for i in range(5): video = self.model(prompt, generator=torch.Generator(self.model.device).manual_seed(i)).frames[0] save_name = prompt + "-" + str(i) + ".mp4" save_path = os.path.join(self.save_root, save_name) export_to_video(video, save_path, fps=8) class FVDEval(T2VTask): def evaluate(self, real_video_root): model = I3DFeatureExtractor().cuda().eval() real_features = get_dataset_features(real_video_root, model) generated_features = get_dataset_features(self.save_root, model) mu_real = mean(real_features, axis=0) mu_generated = mean(generated_features, axis=0) sigma_real = cov(real_features, rowvar=False) sigma_generated = cov(generated_features, rowvar=False) diff = mu_real - mu_generated covmean, _ = sqrtm(sigma_real.dot(sigma_generated), disp=False) if np.iscomplexobj(covmean): covmean = covmean.real fvd = diff.dot(diff) + np.trace(sigma_real + sigma_generated - 2 * covmean) print(f"{self.taskname} score: {fvd}") return fvd class ThirdPartyEval(T2VTask): def evaluate(self): videos_path = Path(self.save_root).resolve() dimension = task2dimension[self.taskname] full_info = Path("./full_info_t2v.json").resolve() output_dir = Path("./evaluation_results").resolve() output_dir = output_dir.joinpath(self.taskname) output_dir.mkdir(parents=True, exist_ok=True) cmd = [ "python", "-W", "ignore", "evaluate.py", "--full_json_dir", str(full_info), "--videos_path", str(videos_path), "--dimension", dimension, "--output_path", str(output_dir) ] try: subprocess.run(cmd, check=True) except subprocess.CalledProcessError as exc: raise RuntimeError(f"Evaluation failed: {exc}") from exc score = self._parse_result_file(Path(output_dir)) print(f"{self.taskname} score: {score}") return score class I2VTask(BaseTask): def _parse_result_file(self, output_dir: Path) -> float | None: score = 0 for jsonfile in output_dir.iterdir(): if "eval" in jsonfile.name: with open(jsonfile.as_posix(), "r") as file: data: dict = json.load(file) score += list(data.values())[0][0] return score def _parse_data(self, task_data): self.dirpath = os.path.dirname(task_data) with open(task_data, "r") as file: annos = json.load(file) taskname = annos["task"].replace(" ", "") self.taskname = taskname self.dimensions = ("subject_consistency", "overall_consistency", "motion_smoothness", "dynamic_degree") self.save_root = os.path.join("General-Bench", "Video-Generation", taskname) def run_inference(self): for d in self.data: prompt = d["input"]["prompt"] image = d["input"]["image"] image = os.path.join(self.dirpath, image) for i in range(5): video = self.model( prompt=prompt, image=image, generator=torch.Generator(self.model.device).manual_seed(i) ).frames[0] save_name = prompt + "-" + str(i) + ".mp4" save_path = os.path.join(self.save_root, save_name) export_to_video(video, save_path, fps=8) def evaluate(self): taskname = self.taskname full_info = Path("./full_info_i2v.json").resolve() output_dir = Path("./evaluation_results").resolve() output_dir = output_dir.joinpath(taskname) output_dir.mkdir(parents=True, exist_ok=True) for dimension in self.dimensions: cmd = [ "python", "-W", "ignore", "evaluate.py", "--full_json_dir", str(full_info), "--videos_path", str(self.save_root), "--dimension", dimension, "--output_path", str(output_dir) ] try: subprocess.run(cmd, check=True) except subprocess.CalledProcessError as exc: raise RuntimeError(f"Evaluation failed: {exc}") from exc score = self._parse_result_file(Path(output_dir)) print(f"{self.taskname} score: {score}") return score class AthleticsT2V(FVDEval): pass class HumanT2V(FVDEval): pass class ConcertT2V(FVDEval): pass class TerrestrialAnimalT2V(FVDEval): pass class WaterSportsT2V(FVDEval): pass class ActionT2V(ThirdPartyEval): pass class ArtisticT2V(ThirdPartyEval): pass class BackgroundConsistency(ThirdPartyEval): pass class CameraMotionT2V(ThirdPartyEval): pass class ClassConditionedT2V(ThirdPartyEval): pass class ColorT2V(ThirdPartyEval): pass class DynamicT2V(ThirdPartyEval): pass class MaterialT2V(ThirdPartyEval): pass class MultiClassConditionedT2V(ThirdPartyEval): pass class SceneT2V(ThirdPartyEval): pass class SpatialRelationT2V(ThirdPartyEval): pass class StaticT2V(ThirdPartyEval): pass class StyleT2V(ThirdPartyEval): pass class ArchitectureI2V(I2VTask): pass class ClothI2V(I2VTask): pass class FoodI2V(I2VTask): pass class FurnitureI2V(I2VTask): pass class HumanI2V(I2VTask): pass class PetI2V(I2VTask): pass class PlantI2V(I2VTask): pass class SceneI2V(I2VTask): pass class VehicleI2V(I2VTask): pass class WeatherI2V(I2VTask): pass class WildAnimalI2V(I2VTask): pass if __name__ == "__main__": root = Path("General-Bench-Openset/video/generation") task_type = "T2V" model = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.bfloat16).to("cuda") task_files = [ "AthleticsT2V", "HumanT2V", "ConcertT2V", "TerrestrialAnimalT2V", "WaterSportsT2V", "ActionT2V", "ArtisticT2V", "BackgroundConsistency", "CameraMotionT2V", "ClassConditionedT2V", "ColorT2V", "DynamicT2V", "MaterialT2V", "MultiClassConditionedT2V", "SceneT2V", "SpatialRelationT2V", "StaticT2V", "StyleT2V", "ArchitectureI2V", "ClothI2V", "FoodI2V", "FurnitureI2V", "HumanI2V", "PetI2V", "PlantI2V", "SceneI2V", "VehicleI2V", "WeatherI2V", "WildAnimalI2V", ] task_files = [root.joinpath(task, "annotation.json") for task in task_files] for idx, file in enumerate(task_files): if file.exists(): continue with open(file.as_posix(), 'r', encoding='utf-8') as f: task_data = json.load(f) task_name = task_data["task"] print(f"Running evaluation for task {idx + 1}: {task_name}") TASK_MAPPING = { "AthleticsT2V": AthleticsT2V, "HumanT2V": HumanT2V, "ConcertT2V": ConcertT2V, "TerrestrialAnimalT2V": TerrestrialAnimalT2V, "WaterSportsT2V": WaterSportsT2V, "ActionT2V": ActionT2V, "ArtisticT2V": ArtisticT2V, "BackgroundConsistency": BackgroundConsistency, "CameraMotionT2V": CameraMotionT2V, "ClassConditionedT2V": ClassConditionedT2V, "ColorT2V": ColorT2V, "DynamicT2V": DynamicT2V, "MaterialT2V": MaterialT2V, "MultiClassConditionedT2V": MultiClassConditionedT2V, "SceneT2V": SceneT2V, "SpatialRelationT2V": SpatialRelationT2V, "StaticT2V": StaticT2V, "StyleT2V": StyleT2V, "ArchitectureI2V": ArchitectureI2V, "ClothI2V": ClothI2V, "FoodI2V": FoodI2V, "FurnitureI2V": FurnitureI2V, "HumanI2V": HumanI2V, "PetI2V": PetI2V, "PlantI2V": PlantI2V, "SceneI2V": SceneI2V, "VehicleI2V": VehicleI2V, "WeatherI2V": WeatherI2V, "WildAnimalI2V": WildAnimalI2V, } clean_task_name = task_name.replace(" ", "") task_class = TASK_MAPPING.get(clean_task_name) if task_class is None: raise NotImplementedError elif task_type not in clean_task_name: continue else: task = task_class(file.as_posix(), model) task.run_inference() metrics = task.evaluate() print("Task name: ", task_name, "Task type: ", task_type, "Evaluation results:", metrics)