File size: 9,600 Bytes
be2df75
 
 
 
132e8c4
 
be2df75
b5d7f4c
a5265d3
6789b6e
1a6f91c
be2df75
 
 
2fa2e84
 
 
e349e43
 
 
 
132e8c4
be2df75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85f39ae
132e8c4
be2df75
d35cde0
be2df75
 
 
132e8c4
be2df75
132e8c4
be2df75
 
d35cde0
be2df75
132e8c4
be2df75
132e8c4
 
 
 
be2df75
132e8c4
 
 
be2df75
 
 
ef15707
6789b6e
 
 
be2df75
f6dd4f3
 
 
 
 
6789b6e
 
f6dd4f3
 
be2df75
 
 
 
 
 
ef15707
 
be2df75
 
ef15707
 
be2df75
ef15707
f6dd4f3
 
 
be2df75
 
 
 
f6dd4f3
1a6f91c
be2df75
 
f6dd4f3
 
 
 
 
1a6f91c
be2df75
f6dd4f3
 
 
 
 
be2df75
 
 
 
 
f6dd4f3
 
be2df75
132e8c4
 
be2df75
132e8c4
 
be2df75
 
 
 
 
 
 
 
 
 
 
 
132e8c4
be2df75
 
 
 
132e8c4
be2df75
 
132e8c4
d35cde0
132e8c4
be2df75
 
 
 
 
 
 
 
 
 
 
 
e349e43
132e8c4
1a6f91c
be2df75
 
 
 
b5d7f4c
be2df75
ef15707
 
be2df75
 
 
 
 
b5d7f4c
 
ef15707
 
be2df75
ef15707
1a6f91c
be2df75
29cace5
 
1a6f91c
 
ef15707
f6dd4f3
1a6f91c
f6dd4f3
0d0a1bc
 
 
 
 
 
 
 
 
 
 
 
 
132e8c4
be2df75
6941871
be2df75
6941871
be2df75
 
 
 
b97329f
a5265d3
be2df75
 
d35cde0
1a6f91c
be2df75
f6dd4f3
 
1a6f91c
132e8c4
 
a5265d3
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Any, Optional, Tuple
import asyncio
import base64
import io
import logging
import random
import traceback
import os
import numpy as np
import torch
from diffusers import LTXPipeline, LTXImageToVideoPipeline
from PIL import Image

from varnish import Varnish
from varnish.debug_utils import setup_debug_logging

# Configure the root logger at INFO level and create this module's logger,
# which the handler below uses to report tensor shapes during generation.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Constraints (upper bounds applied by GenerationConfig.validate_and_adjust)
MAX_WIDTH = 1280
MAX_HEIGHT = 720
MAX_FRAMES = 257

@dataclass
class GenerationConfig:
    """Configuration for video generation.

    Instances are created from request parameters and then normalised with
    :meth:`validate_and_adjust`, which snaps the values onto the constraints
    declared at module level (``MAX_WIDTH``, ``MAX_HEIGHT``, ``MAX_FRAMES``).
    """
    # Requested output size in pixels (snapped to multiples of 32 on validation)
    width: int = 768
    height: int = 512
    # Frames per second; must be strictly positive
    fps: int = 24
    # Requested clip length in seconds (adjusted so the frame count is 8k + 1)
    duration_sec: float = 4.0
    num_inference_steps: int = 30
    guidance_scale: float = 7.5
    upscale_factor: float = 2.0
    enable_interpolation: bool = False
    seed: int = -1  # -1 means random seed

    @property
    def num_frames(self) -> int:
        """Calculate number of frames based on fps and duration"""
        # validate_and_adjust() stores duration_sec = (n - 1) / fps; multiplying
        # back can land a hair below n - 1 (e.g. 8 / 49 * 49 == 7.999999999999999),
        # so add a small tolerance before truncating to avoid losing a frame.
        return int(self.duration_sec * self.fps + 1e-6) + 1

    @staticmethod
    def _snap_dimension(value: float, limit: int) -> int:
        """Round *value* to the nearest multiple of 32 within [32, limit].

        The upper bound is itself rounded down to a multiple of 32 so that
        clamping can never produce a size that violates the constraint
        (MAX_HEIGHT = 720 is not divisible by 32).
        """
        largest_allowed = (limit // 32) * 32
        return max(32, min(largest_allowed, round(value / 32) * 32))

    def validate_and_adjust(self) -> 'GenerationConfig':
        """Validate and adjust parameters to meet constraints.

        Mutates the instance in place and returns it so calls can be chained.

        Returns:
            This same instance, with width/height snapped to multiples of 32,
            duration_sec adjusted so that ``num_frames`` has the form 8k + 1
            (capped at MAX_FRAMES), and a concrete seed chosen if it was -1.

        Raises:
            ValueError: If ``fps`` is not strictly positive.
        """
        if self.fps <= 0:
            raise ValueError(f"fps must be a positive number, got {self.fps!r}")

        # Round dimensions to nearest multiple of 32
        self.width = self._snap_dimension(self.width, MAX_WIDTH)
        self.height = self._snap_dimension(self.height, MAX_HEIGHT)

        # Adjust number of frames to be in format 8k + 1, keeping k within
        # [0, largest k allowed by MAX_FRAMES] so negative durations cannot
        # yield a negative frame count.
        max_k = (MAX_FRAMES - 1) // 8
        k = min(max(0, (self.num_frames - 1) // 8), max_k)
        num_frames = (k * 8) + 1
        self.duration_sec = (num_frames - 1) / self.fps

        # Set random seed if not specified
        if self.seed == -1:
            self.seed = random.randint(0, 2**32 - 1)

        return self

class EndpointHandler:
    """Handles video generation requests using LTX models and Varnish post-processing"""
    
    def __init__(self, model_path: str = ""):
        """Initialize the handler with LTX models and Varnish

        Args:
            model_path: Path to LTX model weights
        """
        # Enable TF32 for potential speedup on Ampere GPUs
        #torch.backends.cuda.matmul.allow_tf32 = True
        
        # Initialize models with bfloat16 precision
        self.text_to_video = LTXPipeline.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16
        ).to("cuda")
        
        self.image_to_video = LTXImageToVideoPipeline.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16
        ).to("cuda")

        # Enable CPU offload for memory efficiency
        #self.text_to_video.enable_model_cpu_offload()
        #self.image_to_video.enable_model_cpu_offload()

        # temporary enable this if you have some issues with locating the model files
        setup_debug_logging()

        # Initialize Varnish for post-processing
        self.varnish = Varnish(
            device="cuda" if torch.cuda.is_available() else "cpu",
            output_format="mp4",
            output_codec="h264",
            output_quality=23,
            enable_mmaudio=False,
            #model_base_dir=os.path.abspath(os.path.join(os.getcwd(), "varnish"))
        )

    async def process_frames(
        self,
        frames: torch.Tensor,
        config: GenerationConfig
    ) -> tuple[str, dict]:
        """Post-process generated frames using Varnish
        
        Args:
            frames: Generated video frames tensor
            config: Generation configuration
            
        Returns:
            Tuple of (video data URI, metadata dictionary)
        """
        # Process video with Varnish
        result = await self.varnish(
            input_data=frames,
            input_fps=config.fps,
            upscale_factor=config.upscale_factor if config.upscale_factor > 1 else None,
            enable_interpolation=config.enable_interpolation,
            output_fps=config.fps
        )
        
        # Convert to data URI
        video_uri = await result.write(
            output_type="data-uri",
            output_format="mp4",
            output_codec="h264",
            output_quality=23
        )
        
        # Collect metadata
        metadata = {
            "width": result.metadata.width,
            "height": result.metadata.height,
            "num_frames": result.metadata.frame_count,
            "fps": result.metadata.fps,
            "duration": result.metadata.duration,
            "num_inference_steps": config.num_inference_steps,
            "seed": config.seed,
            "upscale_factor": config.upscale_factor,
            "interpolation_enabled": config.enable_interpolation
        }
        
        return video_uri, metadata

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Process incoming requests for video generation
        
        Args:
            data: Request data containing:
                - inputs (str): Text prompt or image
                - width (optional): Video width
                - height (optional): Video height
                - fps (optional): Frames per second
                - duration_sec (optional): Video duration
                - num_inference_steps (optional): Inference steps
                - guidance_scale (optional): Guidance scale
                - upscale_factor (optional): Upscaling factor
                - enable_interpolation (optional): Enable frame interpolation
                - seed (optional): Random seed
                
        Returns:
            Dictionary containing:
                - video: Base64 encoded MP4 data URI
                - content-type: MIME type
                - metadata: Generation metadata
        """
        # Extract prompt
        prompt = data.get("inputs")
        if not prompt:
            raise ValueError("No prompt provided in the 'inputs' field")

        # Create and validate configuration
        config = GenerationConfig(
            width=data.get("width", GenerationConfig.width),
            height=data.get("height", GenerationConfig.height),
            fps=data.get("fps", GenerationConfig.fps),
            duration_sec=data.get("duration_sec", GenerationConfig.duration_sec),
            num_inference_steps=data.get("num_inference_steps", GenerationConfig.num_inference_steps),
            guidance_scale=data.get("guidance_scale", GenerationConfig.guidance_scale),
            upscale_factor=data.get("upscale_factor", GenerationConfig.upscale_factor),
            enable_interpolation=data.get("enable_interpolation", GenerationConfig.enable_interpolation),
            seed=data.get("seed", GenerationConfig.seed)
        ).validate_and_adjust()

        try:
            with torch.no_grad():
                # Set random seeds
                random.seed(config.seed)
                np.random.seed(config.seed)
                generator = torch.manual_seed(config.seed)
                
                # Prepare generation parameters
                generation_kwargs = {
                    "prompt": prompt,
                    "height": config.height,
                    "width": config.width,
                    "num_frames": config.num_frames,
                    "guidance_scale": config.guidance_scale,
                    "num_inference_steps": config.num_inference_steps,
                    "output_type": "pt",
                    "generator": generator
                }

                # Check if image-to-video generation is requested
                image_data = data.get("image")
                if image_data:
                    # Process base64 image
                    if image_data.startswith('data:'):
                        image_data = image_data.split(',', 1)[1]
                    image_bytes = base64.b64decode(image_data)
                    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
                    generation_kwargs["image"] = image
                    frames = self.image_to_video(**generation_kwargs).frames
                else:
                    frames = self.text_to_video(**generation_kwargs).frames
                
                # Log original shape
                logger.info(f"Original frames shape: {frames.shape}")
                
                # Remove batch dimension if present
                if len(frames.shape) == 5:
                    frames = frames.squeeze(0)  # Remove batch dimension
                
                logger.info(f"Processed frames shape: {frames.shape}")
                
                # Ensure we have the correct shape
                if len(frames.shape) != 4:
                    raise ValueError(f"Expected tensor of shape [frames, channels, height, width], got shape {frames.shape}")

                # Post-process frames
                
                loop = asyncio.new_event_loop()
                
                try:
                    video_uri, metadata = loop.run_until_complete(
                        self.process_frames(frames, config)
                    )
                except Exception as e:
                    raise RuntimeError(f"Failed to convert the frames to a video, because {str(e)}")
                finally:
                    loop.close()
                
                return {
                    "video": video_uri,
                    "content-type": "video/mp4",
                    "metadata": metadata
                }

        except Exception as e:
            message = f"Error generating video ({str(e)})\n{traceback.format_exc()}"
            print(message)
            raise RuntimeError(message)