Commit dfc94bc committed by jbilcke-hf (HF staff) · 1 parent: ac45732

upgrade finetrainers + freeze datasets

finetrainers/data/dataset.py CHANGED
@@ -970,59 +970,9 @@ def _preprocess_image(image: PIL.Image.Image) -> torch.Tensor:
     image = image.permute(2, 0, 1).contiguous() / 127.5 - 1.0
     return image
 
-def _preprocess_video(video) -> torch.Tensor:
-    import torch
-    import numpy as np
-
-    # For decord VideoReader
-    if hasattr(video, 'get_batch') and 'decord' in str(type(video)):
-        video = video.get_batch(list(range(len(video))))
-        video = video.permute(0, 3, 1, 2).contiguous() / 127.5 - 1.0
-        return video
-
-    # For torchvision VideoReader
-    elif 'torchvision.io.video_reader' in str(type(video)):
-        # Use the correct iteration pattern for torchvision.io.VideoReader
-        frames = []
-        try:
-            # First seek to the beginning
-            video.seek(0)
-
-            # Then collect frames by iterating
-            for _ in range(30):  # Try to get a reasonable number of frames
-                try:
-                    frame_dict = next(video)
-                    frame = frame_dict["data"]  # Extract the tensor data from the dict
-                    frames.append(frame)
-                except StopIteration:
-                    break
-        except Exception as e:
-            print(f"Error iterating VideoReader: {e}")
-
-        if frames:
-            # In torchvision.io.VideoReader, frames are already in [C, H, W] format
-            # We need to stack and convert to [B, C, H, W]
-            stacked_frames = torch.stack(frames)
-            # Normalize to [-1, 1]
-            stacked_frames = stacked_frames.float() / 127.5 - 1.0
-            return stacked_frames
-
-        # If we couldn't get frames, create a dummy tensor
-        print("Failed to get frames, creating dummy tensor")
-        return torch.zeros(16, 3, 512, 768).float()
-
-    # For list of PIL images
-    elif isinstance(video, list) and len(video) > 0 and hasattr(video[0], 'convert'):
-        frames = []
-        for img in video:
-            img_tensor = torch.from_numpy(np.array(img.convert("RGB"))).float()
-            frames.append(img_tensor)
-
-        video = torch.stack(frames)
-        video = video.permute(0, 3, 1, 2).contiguous() / 127.5 - 1.0
-        return video
-
-    # Unknown type
-    else:
-        print(f"Unknown video type: {type(video)}")
-        return torch.zeros(16, 3, 512, 768).float()
+
+def _preprocess_video(video: decord.VideoReader) -> torch.Tensor:
+    video = video.get_batch(list(range(len(video))))
+    video = video.permute(0, 3, 1, 2).contiguous()
+    video = video.float() / 127.5 - 1.0
+    return video
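The simplified _preprocess_video above assumes the decord-backed VideoReader that datasets==3.3.2 (pinned below) still hands out. A rough sketch of the expected input and output follows; the file path, the bridge call, and the assertions are illustrative, not part of the commit:

# Sketch: exercising the decord path assumed by _preprocess_video.
# Assumes decord is installed and the torch bridge is active so get_batch returns tensors.
import decord
import torch

decord.bridge.set_bridge("torch")

reader = decord.VideoReader("sample.mp4")                   # illustrative path
frames = reader.get_batch(list(range(len(reader))))         # [T, H, W, C], uint8
frames = frames.permute(0, 3, 1, 2).contiguous()            # [T, C, H, W]
frames = frames.float() / 127.5 - 1.0                       # normalize to [-1, 1]
assert frames.dtype == torch.float32
assert frames.min() >= -1.0 and frames.max() <= 1.0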
 
finetrainers/trainer/sft_trainer/trainer.py CHANGED
@@ -694,13 +694,14 @@ class SFTTrainer:
         # 3. Cleanup & log artifacts
         parallel_backend.wait_for_everyone()
 
+        memory_statistics = utils.get_memory_statistics()
+        logger.info(f"Memory after validation end: {json.dumps(memory_statistics, indent=4)}")
+
         # Remove all hooks that might have been added during pipeline initialization to the models
+        module_names = ["text_encoder", "text_encoder_2", "text_encoder_3", "vae"]
         pipeline.remove_all_hooks()
         del pipeline
-
-        utils.free_memory()
-        memory_statistics = utils.get_memory_statistics()
-        logger.info(f"Memory after validation end: {json.dumps(memory_statistics, indent=4)}")
+        self._delete_components(module_names)
         torch.cuda.reset_peak_memory_stats(parallel_backend.device)
 
         # Gather artifacts from all processes. We also need to flatten them since each process returns a list of artifacts.
@@ -788,7 +789,7 @@
 
     def _init_trackers(self) -> None:
         # TODO(aryan): handle multiple trackers
-        trackers = ["wandb"]
+        trackers = [self.args.report_to]
         experiment_name = self.args.tracker_name or "finetrainers-experiment"
         self.state.parallel_backend.initialize_trackers(
             trackers, experiment_name=experiment_name, config=self._get_training_info(), log_dir=self.args.logging_dir
@@ -836,7 +837,6 @@
         utils.synchronize_device()
 
     def _init_pipeline(self, final_validation: bool = False) -> DiffusionPipeline:
-        parallel_backend = self.state.parallel_backend
         module_names = ["text_encoder", "text_encoder_2", "text_encoder_3", "transformer", "vae"]
 
         if not final_validation:
@@ -871,7 +871,6 @@
             enable_tiling=self.args.enable_tiling,
             enable_model_cpu_offload=self.args.enable_model_cpu_offload,
             training=False,
-            device=parallel_backend.device,
         )
 
         # Load the LoRA weights if performing LoRA finetuning
@@ -880,7 +879,8 @@
 
         components = {module_name: getattr(pipeline, module_name, None) for module_name in module_names}
         self._set_components(components)
-        self._move_components_to_device(list(components.values()))
+        if not self.args.enable_model_cpu_offload:
+            self._move_components_to_device(list(components.values()))
         return pipeline
 
     def _prepare_data(
@@ -923,17 +923,12 @@
         else:
             logger.info("Precomputed condition & latent data exhausted. Loading & preprocessing new data.")
 
-            # TODO(aryan): This needs to be revisited. For some reason, the tests did not detect that self.transformer
-            # had become None after this but should have been loaded back from the checkpoint.
-            # parallel_backend = self.state.parallel_backend
-            # train_state = self.state.train_state
-            # self.checkpointer.save(
-            #     train_state.step,
-            #     force=True,
-            #     _device=parallel_backend.device,
-            #     _is_main_process=parallel_backend.is_main_process,
-            # )
-            # self._delete_components(component_names=["transformer", "unet"])
+            parallel_backend = self.state.parallel_backend
+            if parallel_backend.world_size == 1:
+                self._move_components_to_device([self.transformer], "cpu")
+                utils.free_memory()
+                utils.synchronize_device()
+                torch.cuda.reset_peak_memory_stats(parallel_backend.device)
 
         if self.args.precomputation_once:
             consume_fn = preprocessor.consume_once
@@ -974,8 +969,8 @@
         self._delete_components(component_names)
         del latent_components, component_names, component_modules
 
-        # self.checkpointer.load()
-        # self.transformer = self.checkpointer.states["model"].model[0]
+        if parallel_backend.world_size == 1:
+            self._move_components_to_device([self.transformer])
 
         return condition_iterator, latent_iterator
 
 
requirements.txt CHANGED
@@ -7,6 +7,10 @@ torch==2.5.1
 torchvision==0.20.1
 torchao==0.6.1
 
+# datasets 3.4.0 replaces decord with torchvision
+# let's freeze it for now
+datasets==3.3.2
+
 huggingface_hub
 hf_transfer>=0.1.8
 diffusers @ git+https://github.com/huggingface/diffusers.git@main
requirements_without_flash_attention.txt CHANGED
@@ -8,6 +8,10 @@ torch==2.5.1
 torchvision==0.20.1
 torchao==0.6.1
 
+# datasets 3.4.0 replaces decord with torchvision
+# let's freeze it for now
+datasets==3.3.2
+
 huggingface_hub
 hf_transfer>=0.1.8
 diffusers @ git+https://github.com/huggingface/diffusers.git@main
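
Both requirement files pin datasets==3.3.2 so that video decoding keeps going through decord rather than torchvision. A quick post-install sanity check (illustrative, not part of the commit) might look like:

# Illustrative check that the pin took effect and decord is importable.
import datasets
import decord

assert datasets.__version__ == "3.3.2", datasets.__version__
print("datasets", datasets.__version__, "| decord", decord.__version__)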