upgrade Finetrainers
finetrainers/models/cogvideox/base_specification.py
CHANGED
@@ -299,7 +299,7 @@ class CogVideoXModelSpecification(ModelSpecification):
             latents = posterior.sample(generator=generator)
             del posterior
 
-        if not self.vae_config.invert_scale_latents:
+        if not getattr(self.vae_config, "invert_scale_latents", False):
             latents = latents * self.vae_config.scaling_factor
 
         if patch_size_t is not None:
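A note on the fix above: the `getattr` lookup with a `False` default presumably guards against VAE configs that do not carry the `invert_scale_latents` key, keeping the old scaling behavior instead of raising `AttributeError`. A minimal standalone sketch of the difference (the config objects here are hypothetical stand-ins, not the real VAE config class):

from types import SimpleNamespace

# Hypothetical stand-ins: an older config without the flag, a newer one with it.
old_config = SimpleNamespace(scaling_factor=0.7)
new_config = SimpleNamespace(scaling_factor=0.7, invert_scale_latents=True)

for cfg in (old_config, new_config):
    # Direct access (cfg.invert_scale_latents) would raise AttributeError on
    # old_config; getattr with a default degrades gracefully instead.
    if not getattr(cfg, "invert_scale_latents", False):
        print("scaling latents by", cfg.scaling_factor)
    else:
        print("skipping scaling; the VAE already inverts the scale")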
finetrainers/models/ltx_video/base_specification.py
CHANGED
@@ -336,8 +336,8 @@ class LTXVideoModelSpecification(ModelSpecification):
         latents = self._pack_latents(latents, patch_size, patch_size_t)
         noise = self._pack_latents(noise, patch_size, patch_size_t)
         noisy_latents = self._pack_latents(noisy_latents, patch_size, patch_size_t)
-
         sigmas = sigmas.view(-1, 1, 1).expand(-1, *noisy_latents.shape[1:-1], -1)
+        timesteps = (sigmas * 1000.0).long()
 
         latent_model_conditions["hidden_states"] = noisy_latents.to(latents)
 
@@ -352,7 +352,6 @@ class LTXVideoModelSpecification(ModelSpecification):
             vae_spatial_compression_ratio,
             vae_spatial_compression_ratio,
         ]
-        timesteps = (sigmas * 1000.0).long()
 
         pred = transformer(
             **latent_model_conditions,
@@ -444,9 +443,9 @@ class LTXVideoModelSpecification(ModelSpecification):
         latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor, scaling_factor: float = 1.0
     ) -> torch.Tensor:
         # Normalize latents across the channel dimension [B, C, F, H, W]
-        latents_mean = latents_mean.view(1, -1, 1, 1, 1).to(latents.device)
-        latents_std = latents_std.view(1, -1, 1, 1, 1).to(latents.device)
-        latents = (latents - latents_mean) * scaling_factor / latents_std
+        latents_mean = latents_mean.view(1, -1, 1, 1, 1).to(device=latents.device)
+        latents_std = latents_std.view(1, -1, 1, 1, 1).to(device=latents.device)
+        latents = ((latents.float() - latents_mean) * scaling_factor / latents_std).to(latents)
         return latents
 
     @staticmethod
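The rewritten `_normalize_latents` now does its arithmetic in float32 and only casts back to the input dtype at the end, which avoids precision loss when training in bf16/fp16. A standalone sketch of the pattern (shapes and statistics are illustrative, not the LTX-Video VAE's):

import torch

# Illustrative shapes/statistics only, not the LTX-Video VAE's.
latents = torch.randn(1, 4, 2, 8, 8, dtype=torch.bfloat16)
latents_mean = torch.randn(4)
latents_std = torch.rand(4) + 0.5

mean = latents_mean.view(1, -1, 1, 1, 1).to(device=latents.device)
std = latents_std.view(1, -1, 1, 1, 1).to(device=latents.device)

# Do the arithmetic in float32, then cast back to the input dtype; doing the
# subtraction/division directly in bf16 would round away low-order bits.
normalized = ((latents.float() - mean) / std).to(latents)
assert normalized.dtype == torch.bfloat16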
finetrainers/models/wan/base_specification.py
CHANGED
@@ -39,7 +39,7 @@ class WanLatentEncodeProcessor(ProcessorMixin):
     def __init__(self, output_names: List[str]):
         super().__init__()
         self.output_names = output_names
-        assert len(self.output_names) == 1
+        assert len(self.output_names) == 3
 
     def forward(
         self,
@@ -72,7 +72,10 @@ class WanLatentEncodeProcessor(ProcessorMixin):
         moments = vae._encode(video)
         latents = moments.to(dtype=dtype)
 
-        return {self.output_names[0]: latents}
+        latents_mean = torch.tensor(vae.config.latents_mean)
+        latents_std = 1.0 / torch.tensor(vae.config.latents_std)
+
+        return {self.output_names[0]: latents, self.output_names[1]: latents_mean, self.output_names[2]: latents_std}
 
 
 class WanModelSpecification(ModelSpecification):
@@ -108,7 +111,7 @@ class WanModelSpecification(ModelSpecification):
         if condition_model_processors is None:
             condition_model_processors = [T5Processor(["encoder_hidden_states", "prompt_attention_mask"])]
         if latent_model_processors is None:
-            latent_model_processors = [WanLatentEncodeProcessor(["latents"])]
+            latent_model_processors = [WanLatentEncodeProcessor(["latents", "latents_mean", "latents_std"])]
 
         self.condition_model_processors = condition_model_processors
         self.latent_model_processors = latent_model_processors
@@ -266,7 +269,10 @@ class WanModelSpecification(ModelSpecification):
             "image": image,
             "video": video,
             "generator": generator,
-            "compute_posterior": compute_posterior,
+            # We must force this to False because the latent normalization should be done before
+            # the posterior is computed. The VAE does not handle this any more:
+            # https://github.com/huggingface/diffusers/pull/10998
+            "compute_posterior": False,
             **kwargs,
         }
         input_keys = set(conditions.keys())
@@ -284,20 +290,29 @@ class WanModelSpecification(ModelSpecification):
         compute_posterior: bool = True,
         **kwargs,
     ) -> Tuple[torch.Tensor, ...]:
+        compute_posterior = False  # See explanation in prepare_latents
         if compute_posterior:
             latents = latent_model_conditions.pop("latents")
         else:
-            posterior = DiagonalGaussianDistribution(latent_model_conditions.pop("latents"))
+            latents = latent_model_conditions.pop("latents")
+            latents_mean = latent_model_conditions.pop("latents_mean")
+            latents_std = latent_model_conditions.pop("latents_std")
+
+            mu, logvar = torch.chunk(latents, 2, dim=1)
+            mu = self._normalize_latents(mu, latents_mean, latents_std)
+            logvar = self._normalize_latents(logvar, latents_mean, latents_std)
+            latents = torch.cat([mu, logvar], dim=1)
+
+            posterior = DiagonalGaussianDistribution(latents)
             latents = posterior.sample(generator=generator)
             del posterior
 
         noise = torch.zeros_like(latents).normal_(generator=generator)
         noisy_latents = FF.flow_match_xt(latents, noise, sigmas)
+        timesteps = (sigmas.flatten() * 1000.0).long()
 
         latent_model_conditions["hidden_states"] = noisy_latents.to(latents)
 
-        timesteps = (sigmas.flatten() * 1000.0).long()
-
         pred = transformer(
             **latent_model_conditions,
             **condition_model_conditions,
@@ -367,3 +382,12 @@ class WanModelSpecification(ModelSpecification):
         transformer_copy.save_pretrained(os.path.join(directory, "transformer"))
         if scheduler is not None:
             scheduler.save_pretrained(os.path.join(directory, "scheduler"))
+
+    @staticmethod
+    def _normalize_latents(
+        latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor
+    ) -> torch.Tensor:
+        latents_mean = latents_mean.view(1, -1, 1, 1, 1).to(device=latents.device)
+        latents_std = latents_std.view(1, -1, 1, 1, 1).to(device=latents.device)
+        latents = ((latents.float() - latents_mean) * latents_std).to(latents)
+        return latents
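Context for the Wan changes: after https://github.com/huggingface/diffusers/pull/10998 the VAE no longer normalizes latents itself, so the trainer now pops the raw encoder moments plus the config statistics, normalizes the mean and log-variance halves, and only then builds the posterior. A standalone sketch of that flow (illustrative shapes; the import path assumes a recent diffusers version):

import torch
from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution

C = 16  # illustrative latent channel count
moments = torch.randn(1, 2 * C, 4, 8, 8)  # raw encoder output: [mu | logvar]
latents_mean = torch.randn(C)
latents_std_inv = torch.rand(C) + 0.5     # the diff stores 1 / latents_std

def normalize(x: torch.Tensor) -> torch.Tensor:
    mean = latents_mean.view(1, -1, 1, 1, 1)
    std_inv = latents_std_inv.view(1, -1, 1, 1, 1)
    return ((x.float() - mean) * std_inv).to(x)

# Normalize the mean and log-variance halves separately (mirroring the diff),
# rebuild the moments tensor, and sample from the resulting posterior.
mu, logvar = torch.chunk(moments, 2, dim=1)
moments = torch.cat([normalize(mu), normalize(logvar)], dim=1)
posterior = DiagonalGaussianDistribution(moments)
latents = posterior.sample()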
finetrainers/trainer/sft_trainer/trainer.py
CHANGED
@@ -147,8 +147,11 @@ class SFTTrainer:
 
         # Make sure the trainable params are in float32 if data sharding is not enabled. For FSDP, we need all
         # parameters to be of the same dtype.
-        if self.args.training_type == TrainingType.LORA and not parallel_backend.data_sharding_enabled:
-            cast_training_params([self.transformer], dtype=torch.float32)
+        if parallel_backend.data_sharding_enabled:
+            self.transformer.to(dtype=self.args.transformer_dtype)
+        else:
+            if self.args.training_type == TrainingType.LORA:
+                cast_training_params([self.transformer], dtype=torch.float32)
 
     def _prepare_for_training(self) -> None:
         # 1. Apply parallelism