jbilcke-hf (HF staff) committed
Commit 9d84818
1 Parent(s): ea52235

Update handler.py

Files changed (1)
  1. handler.py +203 -0
handler.py CHANGED
@@ -104,6 +104,209 @@ class GenerationConfig:
 
        return self
 
+class EndpointHandler:
+    """Handles video generation requests using LTX models and Varnish post-processing"""
+
+    def __init__(self, model_path: str = ""):
+        """Initialize the handler with LTX models and Varnish
+
+        Args:
+            model_path: Path to LTX model weights
+        """
+        # Enable TF32 for potential speedup on Ampere GPUs
+        #torch.backends.cuda.matmul.allow_tf32 = True
+
+        # Initialize models with bfloat16 precision
+        self.text_to_video = LTXPipeline.from_pretrained(
+            model_path,
+            torch_dtype=torch.bfloat16
+        ).to("cuda")
+
+        self.image_to_video = LTXImageToVideoPipeline.from_pretrained(
+            model_path,
+            torch_dtype=torch.bfloat16
+        ).to("cuda")
+
+        # Enable CPU offload for memory efficiency
+        #self.text_to_video.enable_model_cpu_offload()
+        #self.image_to_video.enable_model_cpu_offload()
+
+        # Initialize Varnish for post-processing
+        self.varnish = Varnish(
+            device="cuda" if torch.cuda.is_available() else "cpu",
+            output_format="mp4",
+            output_codec="h264",
+            output_quality=23,
+            enable_mmaudio=False,
+            #model_base_dir=os.path.abspath(os.path.join(os.getcwd(), "varnish"))
+            model_base_dir="/repository/varnish",
+        )
+
+    async def process_frames(
+        self,
+        frames: torch.Tensor,
+        config: GenerationConfig
+    ) -> tuple[str, dict]:
+        """Post-process generated frames using Varnish
+
+        Args:
+            frames: Generated video frames tensor
+            config: Generation configuration
+
+        Returns:
+            Tuple of (video data URI, metadata dictionary)
+        """
+        try:
+            logger.info(f"Original frames shape: {frames.shape}")
+
+            # Remove batch dimension if present
+            if len(frames.shape) == 5:
+                frames = frames.squeeze(0)  # Remove batch dimension
+
+            logger.info(f"Processed frames shape: {frames.shape}")
+
+            # Process video with Varnish
+            result = await self.varnish(
+                input_data=frames,
+                input_fps=config.fps,
+                output_fps=config.fps,
+                upscale_factor=config.upscale_factor if config.upscale_factor > 1 else None,
+                enable_interpolation=config.enable_interpolation
+            )
+
+            # Convert to data URI
+            video_uri = await result.write(
+                output_type="data-uri",
+                output_format="mp4",
+                output_codec="h264",
+                output_quality=23
+            )
+
+            # Collect metadata
+            metadata = {
+                "width": result.metadata.width,
+                "height": result.metadata.height,
+                "num_frames": result.metadata.frame_count,
+                "fps": result.metadata.fps,
+                "duration": result.metadata.duration,
+                "num_inference_steps": config.num_inference_steps,
+                "seed": config.seed,
+                "upscale_factor": config.upscale_factor,
+                "interpolation_enabled": config.enable_interpolation
+            }
+
+            return video_uri, metadata
+
+        except Exception as e:
+            logger.error(f"Error in process_frames: {str(e)}")
+            raise RuntimeError(f"Failed to process frames: {str(e)}")
+
+from dataclasses import dataclass
+from pathlib import Path
+import pathlib
+from typing import Dict, Any, Optional, Tuple
+import asyncio
+import base64
+import io
+import pprint
+import logging
+import random
+import traceback
+import os
+import numpy as np
+import torch
+from diffusers import LTXPipeline, LTXImageToVideoPipeline
+from PIL import Image
+
+from varnish import Varnish
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Constraints
+MAX_WIDTH = 1280
+MAX_HEIGHT = 720
+MAX_FRAMES = 257
+
+# this is only a temporary solution (famous last words)
+def apply_dirty_hack_to_patch_file_extensions_and_bypass_filter(directory):
+    """
+    Recursively rename all '.wut' files to '.pth' in the given directory
+
+    Args:
+        directory (str): Path to the directory to process
+    """
+    # Convert the directory path to absolute path
+    directory = os.path.abspath(directory)
+
+    # Walk through directory and its subdirectories
+    for root, _, files in os.walk(directory):
+        for filename in files:
+            if filename.endswith('.wut'):
+                # Get full path of the file
+                old_path = os.path.join(root, filename)
+                # Create new filename by replacing the extension
+                new_filename = filename.replace('.wut', '.pth')
+                new_path = os.path.join(root, new_filename)
+
+                try:
+                    os.rename(old_path, new_path)
+                    print(f"Renamed: {old_path} -> {new_path}")
+                except OSError as e:
+                    print(f"Error renaming {old_path}: {e}")
+
+def print_directory_structure(startpath):
+    """Print the directory structure starting from the given path."""
+    for root, dirs, files in os.walk(startpath):
+        level = root.replace(startpath, '').count(os.sep)
+        indent = ' ' * 4 * level
+        logger.info(f"{indent}{os.path.basename(root)}/")
+        subindent = ' ' * 4 * (level + 1)
+        for f in files:
+            logger.info(f"{subindent}{f}")
+
+logger.info("💡 Applying a dirty hack (patch ""/repository"" to fix file extensions):")
+apply_dirty_hack_to_patch_file_extensions_and_bypass_filter("/repository")
+
+logger.info("💡 Printing directory structure of ""/repository"":")
+print_directory_structure("/repository")
+
+@dataclass
+class GenerationConfig:
+    """Configuration for video generation"""
+    width: int = 768
+    height: int = 512
+    fps: int = 24
+    duration_sec: float = 4.0
+    num_inference_steps: int = 30
+    guidance_scale: float = 7.5
+    upscale_factor: float = 2.0
+    enable_interpolation: bool = False
+    seed: int = -1  # -1 means random seed
+
+    @property
+    def num_frames(self) -> int:
+        """Calculate number of frames based on fps and duration"""
+        return int(self.duration_sec * self.fps) + 1
+
+    def validate_and_adjust(self) -> 'GenerationConfig':
+        """Validate and adjust parameters to meet constraints"""
+        # Round dimensions to nearest multiple of 32
+        self.width = max(32, min(MAX_WIDTH, round(self.width / 32) * 32))
+        self.height = max(32, min(MAX_HEIGHT, round(self.height / 32) * 32))
+
+        # Adjust number of frames to be in format 8k + 1
+        k = (self.num_frames - 1) // 8
+        num_frames = min((k * 8) + 1, MAX_FRAMES)
+        self.duration_sec = (num_frames - 1) / self.fps
+
+        # Set random seed if not specified
+        if self.seed == -1:
+            self.seed = random.randint(0, 2**32 - 1)
+
+        return self
+
 class EndpointHandler:
     """Handles video generation requests using LTX models and Varnish post-processing"""
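
Not part of the commit, but as a quick sanity check of the constraints introduced above, here is a minimal sketch (assuming the GenerationConfig dataclass and the MAX_WIDTH/MAX_HEIGHT/MAX_FRAMES constants defined in this diff) of how validate_and_adjust() snaps dimensions to multiples of 32 within 1280x720 and keeps the frame count in the 8k + 1 form:

# Illustrative sketch only; relies on the GenerationConfig added in this commit.
config = GenerationConfig(width=1000, height=600, fps=24, duration_sec=4.0)
config = config.validate_and_adjust()

print(config.width)         # 992  -> 1000 rounded to the nearest multiple of 32
print(config.height)        # 608  -> 600 rounded to the nearest multiple of 32
print(config.num_frames)    # 97   -> 4.0 s * 24 fps + 1, already of the form 8k + 1 (k = 12)
print(config.duration_sec)  # 4.0  -> recomputed as (97 - 1) / 24

Note that MAX_FRAMES = 257 is itself of the form 8k + 1 (k = 32), so clamping to it preserves the invariant.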