jbilcke-hf (HF staff) committed
Commit 1a6f91c
1 parent: 5ed7cbf

Update handler.py

Files changed (1)
  1. handler.py (+86 -31)
handler.py CHANGED
@@ -4,6 +4,10 @@ from diffusers import LTXPipeline, LTXImageToVideoPipeline
 from PIL import Image
 import base64
 import io
+import tempfile
+import numpy as np
+from moviepy.editor import ImageSequenceClip
+import os
 
 class EndpointHandler:
     def __init__(self, path: str = ""):
@@ -26,6 +30,50 @@ class EndpointHandler:
         # Enable memory optimizations
         self.text_to_video.enable_model_cpu_offload()
         self.image_to_video.enable_model_cpu_offload()
+
+        # Set default FPS
+        self.fps = 24
+
+    def _create_video_file(self, images: torch.Tensor, fps: int = 24) -> bytes:
+        """Convert frames to an MP4 video file.
+
+        Args:
+            images (torch.Tensor): Generated frames tensor
+            fps (int): Frames per second for the output video
+
+        Returns:
+            bytes: MP4 video file content
+        """
+        # Convert tensor to numpy array
+        video_np = images.squeeze(0).permute(1, 2, 3, 0).cpu().float().numpy()
+        video_np = (video_np * 255).astype(np.uint8)
+
+        # Get dimensions
+        height, width = video_np.shape[1:3]
+
+        # Create temporary file
+        output_path = tempfile.mktemp(suffix=".mp4")
+
+        try:
+            # Create video clip and write to file
+            clip = ImageSequenceClip(list(video_np), fps=fps)
+            resized = clip.resize((width, height))
+            resized.write_videofile(output_path, codec="libx264", audio=False)
+
+            # Read the video file
+            with open(output_path, "rb") as f:
+                video_content = f.read()
+
+            return video_content
+
+        finally:
+            # Cleanup
+            if os.path.exists(output_path):
+                os.remove(output_path)
+
+            # Clear memory
+            del video_np
+            torch.cuda.empty_cache()
 
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
         """Process the input data and generate video using LTX.
@@ -35,12 +83,14 @@ class EndpointHandler:
             - prompt (str): Text description for video generation
            - image (Optional[str]): Base64 encoded image for image-to-video generation
             - num_frames (Optional[int]): Number of frames to generate (default: 24)
+            - fps (Optional[int]): Frames per second (default: 24)
             - guidance_scale (Optional[float]): Guidance scale (default: 7.5)
             - num_inference_steps (Optional[int]): Number of inference steps (default: 50)
 
         Returns:
             Dict[str, Any]: Dictionary containing:
-            - frames: List of base64 encoded frames
+            - video: Base64 encoded MP4 video
+            - content-type: MIME type of the video
         """
         # Extract parameters
         prompt = data.get("prompt")
@@ -49,6 +99,7 @@
 
         # Get optional parameters with defaults
         num_frames = data.get("num_frames", 24)
+        fps = data.get("fps", self.fps)
         guidance_scale = data.get("guidance_scale", 7.5)
         num_inference_steps = data.get("num_inference_steps", 50)
 
@@ -56,37 +107,41 @@
         image_data = data.get("image")
 
         try:
-            if image_data:
-                # Decode base64 image
-                image_bytes = base64.b64decode(image_data)
-                image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
-
-                # Generate video from image
-                output = self.image_to_video(
-                    prompt=prompt,
-                    image=image,
-                    num_frames=num_frames,
-                    guidance_scale=guidance_scale,
-                    num_inference_steps=num_inference_steps
-                )
-            else:
-                # Generate video from text only
-                output = self.text_to_video(
-                    prompt=prompt,
-                    num_frames=num_frames,
-                    guidance_scale=guidance_scale,
-                    num_inference_steps=num_inference_steps
-                )
+            with torch.no_grad():
+                if image_data:
+                    # Decode base64 image
+                    image_bytes = base64.b64decode(image_data)
+                    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
+
+                    # Generate video from image
+                    output = self.image_to_video(
+                        prompt=prompt,
+                        image=image,
+                        num_frames=num_frames,
+                        guidance_scale=guidance_scale,
+                        num_inference_steps=num_inference_steps,
+                        output_type="pt"
+                    ).images
+                else:
+                    # Generate video from text only
+                    output = self.text_to_video(
+                        prompt=prompt,
+                        num_frames=num_frames,
+                        guidance_scale=guidance_scale,
+                        num_inference_steps=num_inference_steps,
+                        output_type="pt"
+                    ).images
 
-            # Convert frames to base64
-            frames = []
-            for frame in output.frames[0]:  # First element contains the frames
-                buffer = io.BytesIO()
-                frame.save(buffer, format="PNG")
-                frame_base64 = base64.b64encode(buffer.getvalue()).decode()
-                frames.append(frame_base64)
+            # Convert frames to video file
+            video_content = self._create_video_file(output, fps=fps)
+
+            # Encode video to base64
+            video_base64 = base64.b64encode(video_content).decode('utf-8')
 
-            return {"frames": frames}
+            return {
+                "video": video_base64,
+                "content-type": "video/mp4"
+            }
 
         except Exception as e:
-            raise RuntimeError(f"Error generating video: {str(e)}")
+            raise RuntimeError(f"Error generating video: {str(e)}")
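
After this commit the handler returns a single base64-encoded MP4 (a "video" field plus a "content-type" field) instead of a list of PNG frames. A minimal client-side sketch of the new request/response shape follows; the endpoint URL and token are placeholders, and the flat payload keys simply mirror what __call__ reads from data:

import base64
import requests

# Placeholder values: substitute your own Inference Endpoint URL and HF token.
API_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"
HEADERS = {"Authorization": "Bearer <hf_token>", "Content-Type": "application/json"}

payload = {
    "prompt": "A ship sailing through a storm at night",
    "num_frames": 24,
    "fps": 24,
    "guidance_scale": 7.5,
    "num_inference_steps": 50,
    # Optionally add "image": "<base64-encoded input image>" to take the
    # image-to-video branch of the handler instead of text-to-video.
}

response = requests.post(API_URL, headers=HEADERS, json=payload)
response.raise_for_status()

# The handler returns {"video": <base64 MP4>, "content-type": "video/mp4"}.
result = response.json()
with open("output.mp4", "wb") as f:
    f.write(base64.b64decode(result["video"]))

This sketch assumes the deployment hands the raw JSON body to __call__; if your serving stack wraps parameters under an "inputs" key, adjust the payload accordingly.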