jbilcke-hf
/

LTX-Video-0.9.1-HFIE

@@ -23,7 +23,7 @@ logger = logging.getLogger(__name__)
 # Constraints
 MAX_LARGE_SIDE = 1280
-MAX_SMALL_SIDE = 720
 MAX_FRAMES = (8 * 21) + 1 # visual glitches appear after about 169 frames, so we cap it
 # this is only a temporary solution (famous last words)
@@ -78,15 +78,16 @@ class GenerationConfig:
     negative_prompt: str = "saturated, overlit, worst quality, inconsistent motion, blurry, jittery, distorted, cropped, watermarked, watermark, logo, subtitle, subtitles, lowres"
     # video model settings (will be used during generation of the initial raw video clip)
-    width: int = 768 # max is 1280 but we use a lower value
-    height: int = 416 # max is 720 but we use a lower value
     # users may tend to always set this to the max, to get as much usable content as possible (the max is MAX_FRAMES, i.e. 169).
     # The value must be a multiple of 8, plus 1 frame.
     # visual glitches appear after about 169 frames, so we don't need more actually
     num_frames: int = (8 * 14) + 1
-    guidance_scale: float = 4.0
     num_inference_steps: int = 30
     # reproducible generation settings
@@ -116,12 +117,12 @@ class GenerationConfig:
             total_pixels = self.width * self.height
             if total_pixels > MAX_TOTAL_PIXELS:
                 scale = (MAX_TOTAL_PIXELS / total_pixels) ** 0.5
-                self.width = max(128, min(MAX_LARGE_SIDE, round(self.width * scale / 16) * 16))
-                self.height = max(128, min(MAX_LARGE_SIDE, round(self.height * scale / 16) * 16))
             else:
-                # Round dimensions to nearest multiple of 16
-                self.width = max(128, min(MAX_LARGE_SIDE, round(self.width / 16) * 16))
-                self.height = max(128, min(MAX_LARGE_SIDE, round(self.height / 16) * 16))
         # Adjust number of frames to be in format 8k + 1
         k = (self.num_frames - 1) // 8

 # Constraints
 MAX_LARGE_SIDE = 1280
+MAX_SMALL_SIDE = 768 # should be 720 but it must be divisible by 32
 MAX_FRAMES = (8 * 21) + 1 # visual glitches appear after about 169 frames, so we cap it
 # this is only a temporary solution (famous last words)
     negative_prompt: str = "saturated, overlit, worst quality, inconsistent motion, blurry, jittery, distorted, cropped, watermarked, watermark, logo, subtitle, subtitles, lowres"
     # video model settings (will be used during generation of the initial raw video clip)
+    # we use small values to make things a bit faster
+    width: int = 768
+    height: int = 416
     # users may tend to always set this to the max, to get as much usable content as possible (the max is MAX_FRAMES, i.e. 169).
     # The value must be a multiple of 8, plus 1 frame.
     # visual glitches appear after about 169 frames, so we don't need more actually
     num_frames: int = (8 * 14) + 1
+    guidance_scale: float = 5.0
     num_inference_steps: int = 30
     # reproducible generation settings
             total_pixels = self.width * self.height
             if total_pixels > MAX_TOTAL_PIXELS:
                 scale = (MAX_TOTAL_PIXELS / total_pixels) ** 0.5
+                self.width = max(128, min(MAX_LARGE_SIDE, round(self.width * scale / 32) * 32))
+                self.height = max(128, min(MAX_LARGE_SIDE, round(self.height * scale / 32) * 32))
             else:
+                # Round dimensions to nearest multiple of 32
+                self.width = max(128, min(MAX_LARGE_SIDE, round(self.width / 32) * 32))
+                self.height = max(128, min(MAX_LARGE_SIDE, round(self.height / 32) * 32))
         # Adjust number of frames to be in format 8k + 1
         k = (self.num_frames - 1) // 8