jbilcke-hf
/

LTX-Video-0.9.1-HFIE

@@ -98,6 +98,11 @@ class GenerationConfig:
     grain_amount: float = 0.0
     def validate_and_adjust(self) -> 'GenerationConfig':
         """Validate and adjust parameters to meet constraints"""
         # Round dimensions to nearest multiple of 32
@@ -148,7 +153,6 @@ class EndpointHandler:
             output_format="mp4",
             output_codec="h264",
             output_quality=17,
-            enable_mmaudio=False,
             model_base_dir="/repository/varnish",
         )
@@ -167,14 +171,6 @@ class EndpointHandler:
             Tuple of (video data URI, metadata dictionary)
         """
         try:
-            logger.info(f"Original frames shape: {frames.shape}")
-            # Remove batch dimension if present
-            if len(frames.shape) == 5:
-                frames = frames.squeeze(0)  # Remove batch dimension
-            logger.info(f"Processed frames shape: {frames.shape}")
             # Process video with Varnish
             result = await self.varnish(
                 input_data=frames, # note: this might contain a certain number of frames eg. 97, which will get doubled if double_num_frames is True
@@ -182,6 +178,9 @@ class EndpointHandler:
                 double_num_frames=config.double_num_frames, # if True, the number of frames will be multiplied by 2 using RIFE
                 super_resolution=config.grain_amount_config, # if True, the resolution will be multiplied by 2 using Real_ESRGAN
                 grain_amount_config.grain_amount,
             )
             # Convert to data URI
@@ -228,6 +227,9 @@ class EndpointHandler:
                     - double_num_frames (optional, bool): if enabled, the number of frames will be multiplied by 2 using RIFE
                     - super_resolution (optional, bool): if enabled, the resolution will be multiplied by 2 using Real_ESRGAN
                     - grain_amount (optional, float): amount of film grain to add to the output video
         Returns:
             Dictionary containing:
                 - video: Base64 encoded MP4 data URI
@@ -270,6 +272,9 @@ class EndpointHandler:
             double_num_frames=params.get("double_num_frames", GenerationConfig.double_num_frames), # if True, the number of frames will be multiplied by 2 using RIFE
             super_resolution=params.get("super_resolution", GenerationConfig.super_resolution), # if True, the resolution will be multiplied by 2 using Real_ESRGAN
             grain_amount=params.get("grain_amount", GenerationConfig.grain_amount),
         ).validate_and_adjust()
         logger.info(f"Global request settings:")
@@ -316,33 +321,15 @@ class EndpointHandler:
                     frames = self.image_to_video(**generation_kwargs).frames
                 else:
                     frames = self.text_to_video(**generation_kwargs).frames
-                # Log original shape
-                logger.info(f"Original frames shape: {frames.shape}")
-                # Remove batch dimension if present
-                if len(frames.shape) == 5:
-                    frames = frames.squeeze(0)  # Remove batch dimension
-                logger.info(f"Processed frames shape: {frames.shape}")
-                # Ensure we have the correct shape
-                if len(frames.shape) != 4:
-                    raise ValueError(f"Expected tensor of shape [frames, channels, height, width], got shape {frames.shape}")
-                # Post-process frames
                 try:
                     loop = asyncio.get_event_loop()
                 except RuntimeError:
                     loop = asyncio.new_event_loop()
                     asyncio.set_event_loop(loop)
-                video_uri, metadata = loop.run_until_complete(
-                    self.process_frames(frames, config)
-                )
                 return {
                     "video": video_uri,
                     "content-type": "video/mp4",

     grain_amount: float = 0.0
+    # audio settings
+    enable_audio: bool = False  # Whether to generate audio
+    audio_prompt: str = ""  # Text prompt for audio generation
+    audio_negative_prompt: str = "voices, voice, talking, speaking, speech" # Negative prompt for audio generation
     def validate_and_adjust(self) -> 'GenerationConfig':
         """Validate and adjust parameters to meet constraints"""
         # Round dimensions to nearest multiple of 32
             output_format="mp4",
             output_codec="h264",
             output_quality=17,
             model_base_dir="/repository/varnish",
         )
             Tuple of (video data URI, metadata dictionary)
         """
         try:
             # Process video with Varnish
             result = await self.varnish(
                 input_data=frames, # note: this might contain a certain number of frames eg. 97, which will get doubled if double_num_frames is True
                 double_num_frames=config.double_num_frames, # if True, the number of frames will be multiplied by 2 using RIFE
                 super_resolution=config.grain_amount_config, # if True, the resolution will be multiplied by 2 using Real_ESRGAN
                 grain_amount_config.grain_amount,
+                enable_audio=config.enable_audio,
+                audio_prompt=config.audio_prompt,
+                audio_negative_prompt=config.audio_negative_prompt,
             )
             # Convert to data URI
                     - double_num_frames (optional, bool): if enabled, the number of frames will be multiplied by 2 using RIFE
                     - super_resolution (optional, bool): if enabled, the resolution will be multiplied by 2 using Real_ESRGAN
                     - grain_amount (optional, float): amount of film grain to add to the output video
+                    - enable_audio (optional, bool): automatically generate an audio track
+                    - audio_prompt (optional, str): prompt to use for the audio generation (concepts to add)
+                    - audio_negative_prompt (optional, str): negative prompt to use for the audio generation (concepts to ignore)
         Returns:
             Dictionary containing:
                 - video: Base64 encoded MP4 data URI
             double_num_frames=params.get("double_num_frames", GenerationConfig.double_num_frames), # if True, the number of frames will be multiplied by 2 using RIFE
             super_resolution=params.get("super_resolution", GenerationConfig.super_resolution), # if True, the resolution will be multiplied by 2 using Real_ESRGAN
             grain_amount=params.get("grain_amount", GenerationConfig.grain_amount),
+            enable_audio=params.get("enable_audio", GenerationConfig.enable_audio),
+            audio_prompt=params.get("audio_prompt", GenerationConfig.audio_prompt),
+            audio_negative_prompt=params.get("audio_negative_prompt", GenerationConfig.audio_negative_prompt),
         ).validate_and_adjust()
         logger.info(f"Global request settings:")
                     frames = self.image_to_video(**generation_kwargs).frames
                 else:
                     frames = self.text_to_video(**generation_kwargs).frames
                 try:
                     loop = asyncio.get_event_loop()
                 except RuntimeError:
                     loop = asyncio.new_event_loop()
                     asyncio.set_event_loop(loop)
+                video_uri, metadata = loop.run_until_complete(self.process_frames(frames, config))
                 return {
                     "video": video_uri,
                     "content-type": "video/mp4",