jbilcke-hf (HF staff) committed
Commit be2df75 (1 parent: 9840797)

Update handler.py

Files changed (1)
  1. handler.py +136 -187
handler.py CHANGED
@@ -1,75 +1,90 @@
- from typing import Dict, Any, Union, Optional, Tuple
- import torch
- from diffusers import LTXPipeline, LTXImageToVideoPipeline
- from PIL import Image
  import base64
  import io
- import tempfile
  import random
  import numpy as np
- from moviepy.editor import ImageSequenceClip
- import os
- import logging
- import asyncio
  from varnish import Varnish

  # Configure logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

- ENABLE_CPU_OFFLOAD = True
- EXPERIMENTAL_STUFF = False
-
- random.seed(0)
- np.random.seed(0)
- generator = torch.manual_seed(0)
- # you can notice we don't use device=cuda, for more info see:
- # https://huggingface.co/docs/diffusers/v0.16.0/en/using-diffusers/reproducibility#gpu
-
- varnish = Varnish(
-     enable_mmaudio=False,
-     #mmaudio_config=mmaudio_config
- )

  class EndpointHandler:
-     # Default configuration
-     DEFAULT_FPS = 24
-     DEFAULT_DURATION = 4 # seconds
-     DEFAULT_NUM_FRAMES = (DEFAULT_DURATION * DEFAULT_FPS) + 1 # 97 frames
-     DEFAULT_NUM_STEPS = 25
-     DEFAULT_WIDTH = 768
-     DEFAULT_HEIGHT = 512
-
-     # Constraints
-     MAX_WIDTH = 1280
-     MAX_HEIGHT = 720
-     MAX_FRAMES = 257
-

-     def __init__(self, path: str = ""):
-         """Initialize the LTX Video handler with both text-to-video and image-to-video pipelines.
-
          Args:
-             path (str): Path to the model weights directory
          """
-         if EXPERIMENTAL_STUFF:
-             torch.backends.cuda.matmul.allow_tf32 = True

-         # Load both pipelines with bfloat16 precision as recommended in docs
          self.text_to_video = LTXPipeline.from_pretrained(
-             path,
              torch_dtype=torch.bfloat16
          ).to("cuda")

          self.image_to_video = LTXImageToVideoPipeline.from_pretrained(
-             path,
              torch_dtype=torch.bfloat16
          ).to("cuda")

-         if ENABLE_CPU_OFFLOAD:
-             self.text_to_video.enable_model_cpu_offload()
-             self.image_to_video.enable_model_cpu_offload()
-
          self.varnish = Varnish(
              device="cuda" if torch.cuda.is_available() else "cpu",
              output_format="mp4",
@@ -78,172 +93,115 @@ class EndpointHandler:
              enable_mmaudio=False
          )

-     def _validate_and_adjust_resolution(self, width: int, height: int) -> Tuple[int, int]:
-         """Validate and adjust resolution to meet constraints.
-
-         Args:
-             width (int): Requested width
-             height (int): Requested height
-
-         Returns:
-             Tuple[int, int]: Adjusted (width, height)
-         """
-         # Round to nearest multiple of 32
-         width = round(width / 32) * 32
-         height = round(height / 32) * 32
-
-         # Enforce maximum dimensions
-         width = min(width, self.MAX_WIDTH)
-         height = min(height, self.MAX_HEIGHT)
-
-         # Enforce minimum dimensions
-         width = max(width, 32)
-         height = max(height, 32)
-
-         return width, height
-
-     def _validate_and_adjust_frames(self, num_frames: Optional[int] = None, fps: Optional[int] = None) -> Tuple[int, int]:
-         """Validate and adjust frame count and FPS to meet constraints.

          Args:
-             num_frames (Optional[int]): Requested number of frames
-             fps (Optional[int]): Requested frames per second

          Returns:
-             Tuple[int, int]: Adjusted (num_frames, fps)
          """
-         # Use defaults if not provided
-         fps = fps or self.DEFAULT_FPS
-         num_frames = num_frames or self.DEFAULT_NUM_FRAMES
-
-         # Adjust frames to be in format 8k + 1
-         k = (num_frames - 1) // 8
-         num_frames = (k * 8) + 1
-
-         # Enforce maximum frame count
-         num_frames = min(num_frames, self.MAX_FRAMES)
-
-         return num_frames, fps
-
-     async def process_and_encode_video(
-         self,
-         frames: torch.Tensor,
-         fps: int,
-         upscale_factor: int = 0,
-         enable_interpolation: bool = False,
-         interpolation_exp: int = 1
-     ) -> tuple[str, dict]:
-         """Process video frames using Varnish and return base64 encoded result"""
-
          # Process video with Varnish
          result = await self.varnish(
              input_data=frames,
-             input_fps=fps,
-             output_fps=fps,
-             enable_upscale=upscale_factor > 1,
-             upscale_factor=upscale_factor,
-             enable_interpolation=enable_interpolation,
-             interpolation_exp=interpolation_exp
          )

-         # Get video as data URI
-         video_data_uri = await result.write(
              output_type="data-uri",
              output_format="mp4",
              output_codec="h264",
              output_quality=23
          )

          metadata = {
              "width": result.metadata.width,
              "height": result.metadata.height,
              "num_frames": result.metadata.frame_count,
              "fps": result.metadata.fps,
-             "duration": result.metadata.duration
          }

-         return video_data_uri, metadata

-     def _run_async(self, frames: torch.Tensor, fps: int, upscale_factor: int, enable_interpolation: bool, interpolation_exp: int) -> Dict[str, Any]:
-         """Run asynchronous video processing in a synchronous context"""
-         loop = asyncio.new_event_loop()
-         try:
-             return loop.run_until_complete(
-                 self.process_and_encode_video(
-                     frames=frames,
-                     fps=fps,
-                     upscale_factor=upscale_factor,
-                     enable_interpolation=enable_interpolation,
-                     interpolation_exp=interpolation_exp
-                 )
-             )
-         finally:
-             loop.close()
-
      def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
-         """Process the input data and generate video using LTX.

          Args:
-             data (Dict[str, Any]): Input data containing:
-                 - prompt (str): Text description for video generation
-                 - image (Optional[str]): Base64 encoded image for image-to-video generation
-                 - width (Optional[int]): Video width (default: 768)
-                 - height (Optional[int]): Video height (default: 512)
-                 - num_frames (Optional[int]): Number of frames (default: 97)
-                 - fps (Optional[int]): Frames per second (default: 24)
-                 - num_inference_steps (Optional[int]): Number of inference steps (default: 25)
-                 - guidance_scale (Optional[float]): Guidance scale (default: 7.5)
-
          Returns:
-             Dict[str, Any]: Dictionary containing:
-                 - video: video encoded in Base64 (h.264 MP4 video). This is a data-uri (prefixed with "data:").
-                 - content-type: MIME type of the video (right now always "video/mp4")
-                 - metadata: Dictionary with actual values used for generation
          """
-
-         prompt = data.get("inputs", None)
          if not prompt:
              raise ValueError("No prompt provided in the 'inputs' field")

-         # Get generation parameters
-         width = data.get("width", self.DEFAULT_WIDTH)
-         height = data.get("height", self.DEFAULT_HEIGHT)
-         width, height = self._validate_and_adjust_resolution(width, height)
-
-         num_frames = data.get("num_frames", self.DEFAULT_NUM_FRAMES)
-         fps = data.get("fps", self.DEFAULT_FPS)
-         num_frames, fps = self._validate_and_adjust_frames(num_frames, fps)
-
-         # Get post-processing parameters
-         upscale_factor = data.get("upscale_factor", 0)
-         enable_interpolation = data.get("enable_interpolation", False)
-         interpolation_exp = data.get("interpolation_exp", 1)
-
-         guidance_scale = data.get("guidance_scale", 7.5)
-         num_inference_steps = data.get("num_inference_steps", self.DEFAULT_NUM_STEPS)
-         seed = data.get("seed", -1)
-         seed = random.randint(0, 2**32 - 1) if seed == -1 else int(seed)

          try:
              with torch.no_grad():
-                 random.seed(seed)
-                 np.random.seed(seed)
-                 generator.manual_seed(seed)

                  generation_kwargs = {
                      "prompt": prompt,
-                     "height": height,
-                     "width": width,
-                     "num_frames": num_frames,
-                     "guidance_scale": guidance_scale,
-                     "num_inference_steps": num_inference_steps,
                      "output_type": "pt",
                      "generator": generator
                  }

-                 # Generate frames using appropriate pipeline
                  image_data = data.get("image")
                  if image_data:
                      if image_data.startswith('data:'):
                          image_data = image_data.split(',', 1)[1]
                          image_bytes = base64.b64decode(image_data)
@@ -253,26 +211,17 @@ class EndpointHandler:
                  else:
                      frames = self.text_to_video(**generation_kwargs).frames

-                 # Process and encode video
-                 video_data_uri, metadata = self._run_async(
-                     frames=frames,
-                     fps=fps,
-                     upscale_factor=upscale_factor,
-                     enable_interpolation=enable_interpolation,
-                     interpolation_exp=interpolation_exp
-                 )
-
-                 # Add generation metadata
-                 metadata.update({
-                     "num_inference_steps": num_inference_steps,
-                     "seed": seed,
-                     "upscale_factor": upscale_factor,
-                     "interpolation_enabled": enable_interpolation,
-                     "interpolation_exp": interpolation_exp
-                 })

                  return {
-                     "video": video_data_uri,
                      "content-type": "video/mp4",
                      "metadata": metadata
                  }
 
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Dict, Any, Optional, Tuple
+ import asyncio
  import base64
  import io
+ import logging
  import random
+
  import numpy as np
+ import torch
+ from diffusers import LTXPipeline, LTXImageToVideoPipeline
+ from PIL import Image
  from varnish import Varnish

  # Configure logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

+ # Constraints
+ MAX_WIDTH = 1280
+ MAX_HEIGHT = 720
+ MAX_FRAMES = 257
+
+ @dataclass
+ class GenerationConfig:
+     """Configuration for video generation"""
+     width: int = 768
+     height: int = 512
+     fps: int = 24
+     duration_sec: float = 4.0
+     num_inference_steps: int = 30
+     guidance_scale: float = 7.5
+     upscale_factor: float = 2.0
+     enable_interpolation: bool = False
+     seed: int = -1 # -1 means random seed
+
+     @property
+     def num_frames(self) -> int:
+         """Calculate number of frames based on fps and duration"""
+         return int(self.duration_sec * self.fps) + 1
+
+     def validate_and_adjust(self) -> 'GenerationConfig':
+         """Validate and adjust parameters to meet constraints"""
+         # Round dimensions to nearest multiple of 32
+         self.width = max(32, min(MAX_WIDTH, round(self.width / 32) * 32))
+         self.height = max(32, min(MAX_HEIGHT, round(self.height / 32) * 32))
+
+         # Adjust number of frames to be in format 8k + 1
+         k = (self.num_frames - 1) // 8
+         num_frames = min((k * 8) + 1, MAX_FRAMES)
+         self.duration_sec = (num_frames - 1) / self.fps
+
+         # Set random seed if not specified
+         if self.seed == -1:
+             self.seed = random.randint(0, 2**32 - 1)
+
+         return self

  class EndpointHandler:
+     """Handles video generation requests using LTX models and Varnish post-processing"""

+     def __init__(self, model_path: str = ""):
+         """Initialize the handler with LTX models and Varnish
+
          Args:
+             model_path: Path to LTX model weights
          """
+         # Enable TF32 for potential speedup on Ampere GPUs
+         #torch.backends.cuda.matmul.allow_tf32 = True

+         # Initialize models with bfloat16 precision
          self.text_to_video = LTXPipeline.from_pretrained(
+             model_path,
              torch_dtype=torch.bfloat16
          ).to("cuda")

          self.image_to_video = LTXImageToVideoPipeline.from_pretrained(
+             model_path,
              torch_dtype=torch.bfloat16
          ).to("cuda")

+         # Enable CPU offload for memory efficiency
+         #self.text_to_video.enable_model_cpu_offload()
+         #self.image_to_video.enable_model_cpu_offload()

+         # Initialize Varnish for post-processing
          self.varnish = Varnish(
              device="cuda" if torch.cuda.is_available() else "cpu",
              output_format="mp4",
 
              enable_mmaudio=False
          )

+     async def process_frames(
+         self,
+         frames: torch.Tensor,
+         config: GenerationConfig
+     ) -> tuple[str, dict]:
+         """Post-process generated frames using Varnish

          Args:
+             frames: Generated video frames tensor
+             config: Generation configuration

          Returns:
+             Tuple of (video data URI, metadata dictionary)
          """
          # Process video with Varnish
          result = await self.varnish(
              input_data=frames,
+             input_fps=config.fps,
+             upscale_factor=config.upscale_factor if config.upscale_factor > 1 else None,
+             enable_interpolation=config.enable_interpolation,
+             output_fps=config.fps
          )

+         # Convert to data URI
+         video_uri = await result.write(
              output_type="data-uri",
              output_format="mp4",
              output_codec="h264",
              output_quality=23
          )

+         # Collect metadata
          metadata = {
              "width": result.metadata.width,
              "height": result.metadata.height,
              "num_frames": result.metadata.frame_count,
              "fps": result.metadata.fps,
+             "duration": result.metadata.duration,
+             "num_inference_steps": config.num_inference_steps,
+             "seed": config.seed,
+             "upscale_factor": config.upscale_factor,
+             "interpolation_enabled": config.enable_interpolation
          }

+         return video_uri, metadata

      def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+         """Process incoming requests for video generation

          Args:
+             data: Request data containing:
+                 - inputs (str): Text prompt or image
+                 - width (optional): Video width
+                 - height (optional): Video height
+                 - fps (optional): Frames per second
+                 - duration_sec (optional): Video duration
+                 - num_inference_steps (optional): Inference steps
+                 - guidance_scale (optional): Guidance scale
+                 - upscale_factor (optional): Upscaling factor
+                 - enable_interpolation (optional): Enable frame interpolation
+                 - seed (optional): Random seed
+
          Returns:
+             Dictionary containing:
+                 - video: Base64 encoded MP4 data URI
+                 - content-type: MIME type
+                 - metadata: Generation metadata
          """
+         # Extract prompt
+         prompt = data.get("inputs")
          if not prompt:
              raise ValueError("No prompt provided in the 'inputs' field")

+         # Create and validate configuration
+         config = GenerationConfig(
+             width=data.get("width", GenerationConfig.width),
+             height=data.get("height", GenerationConfig.height),
+             fps=data.get("fps", GenerationConfig.fps),
+             duration_sec=data.get("duration_sec", GenerationConfig.duration_sec),
+             num_inference_steps=data.get("num_inference_steps", GenerationConfig.num_inference_steps),
+             guidance_scale=data.get("guidance_scale", GenerationConfig.guidance_scale),
+             upscale_factor=data.get("upscale_factor", GenerationConfig.upscale_factor),
+             enable_interpolation=data.get("enable_interpolation", GenerationConfig.enable_interpolation),
+             seed=data.get("seed", GenerationConfig.seed)
+         ).validate_and_adjust()

          try:
              with torch.no_grad():
+                 # Set random seeds
+                 random.seed(config.seed)
+                 np.random.seed(config.seed)
+                 generator = torch.manual_seed(config.seed)

+                 # Prepare generation parameters
                  generation_kwargs = {
                      "prompt": prompt,
+                     "height": config.height,
+                     "width": config.width,
+                     "num_frames": config.num_frames,
+                     "guidance_scale": config.guidance_scale,
+                     "num_inference_steps": config.num_inference_steps,
                      "output_type": "pt",
                      "generator": generator
                  }

+                 # Check if image-to-video generation is requested
                  image_data = data.get("image")
                  if image_data:
+                     # Process base64 image
                      if image_data.startswith('data:'):
                          image_data = image_data.split(',', 1)[1]
                          image_bytes = base64.b64decode(image_data)

                  else:
                      frames = self.text_to_video(**generation_kwargs).frames

+                 # Post-process frames
+                 loop = asyncio.new_event_loop()
+                 try:
+                     video_uri, metadata = loop.run_until_complete(
+                         self.process_frames(frames, config)
+                     )
+                 finally:
+                     loop.close()

                  return {
+                     "video": video_uri,
                      "content-type": "video/mp4",
                      "metadata": metadata
                  }
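
For illustration only, a minimal local invocation sketch based on the docstrings in the new handler.py. The model path and payload values below are assumptions for the example, not part of the commit:

    # hypothetical local test of the updated handler
    handler = EndpointHandler(model_path="Lightricks/LTX-Video")  # assumed weights location
    payload = {
        "inputs": "a red panda walking through a bamboo forest",  # text prompt
        "width": 768,
        "height": 512,
        "fps": 24,
        "duration_sec": 4.0,
        "seed": 42,
    }
    result = handler(payload)
    print(result["content-type"])   # "video/mp4"
    print(result["metadata"])       # resolution, frame count, seed, etc.
    # result["video"] holds the generated MP4 as a base64 data URI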