Commit: d78dede
Parent(s): 64a70c0

investigate bugs in Finetrainers

Files changed:
- finetrainers/dataset.py  +165 -66
- finetrainers/trainer.py  +28 -0
- training/cogvideox/dataset.py  +2 -2
finetrainers/dataset.py
CHANGED

@@ -15,6 +15,9 @@ from torchvision import transforms
 from torchvision.transforms import InterpolationMode
 from torchvision.transforms.functional import resize

+import gc
+import time
+import resource

 # Must import after torch because this can sometimes lead to a nasty segmentation fault, or stack smashing error
 # Very few bug reports but it happens. Look in decord Github issues for more relevant information.

@@ -30,6 +33,22 @@ from .constants import (  # noqa
 )


+# Decord is causing us some issues!
+# Let's try to increase file descriptor limits to avoid this error:
+#
+# decord._ffi.base.DECORDError: Resource temporarily unavailable
+try:
+    soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
+    logger.info(f"Current file descriptor limits: soft={soft}, hard={hard}")
+
+    # Try to increase to hard limit if possible
+    if soft < hard:
+        resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
+        new_soft, new_hard = resource.getrlimit(resource.RLIMIT_NOFILE)
+        logger.info(f"Updated file descriptor limits: soft={new_soft}, hard={new_hard}")
+except Exception as e:
+    logger.warning(f"Could not check or update file descriptor limits: {e}")
+
 logger = get_logger(__name__)


@@ -229,20 +248,48 @@ class ImageOrVideoDataset(Dataset):
         return image

     def _preprocess_video(self, path: Path) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
         """
         Loads a single video, or latent and prompt embedding, based on initialization parameters.
-
         Returns a [F, C, H, W] video tensor.
         """
-        video_reader = decord.VideoReader(uri=path.as_posix())
-        video_num_frames = len(video_reader)
-
-        indices = list(range(0, video_num_frames, video_num_frames // self.max_num_frames))
-        frames = video_reader.get_batch(indices)
-        frames = frames[: self.max_num_frames].float()
-        frames = frames.permute(0, 3, 1, 2).contiguous()
-        frames = torch.stack([self.video_transforms(frame) for frame in frames], dim=0)
-        return frames
+        max_retries = 3
+        retry_delay = 1.0  # seconds
+
+        for attempt in range(max_retries):
+            try:
+                # Create video reader
+                video_reader = decord.VideoReader(uri=path.as_posix())
+                video_num_frames = len(video_reader)
+
+                # Process frames
+                indices = list(range(0, video_num_frames, video_num_frames // self.max_num_frames))
+                frames = video_reader.get_batch(indices)
+                frames = frames[: self.max_num_frames].float()
+                frames = frames.permute(0, 3, 1, 2).contiguous()
+                frames = torch.stack([self.video_transforms(frame) for frame in frames], dim=0)
+
+                # Explicitly clean up resources
+                del video_reader
+
+                # Force garbage collection occasionally
+                if random.random() < 0.05:  # 5% chance
+                    gc.collect()
+
+                return frames
+
+            except decord._ffi.base.DECORDError as e:
+                # Log the error
+                error_msg = str(e)
+                if "Resource temporarily unavailable" in error_msg and attempt < max_retries - 1:
+                    logger.warning(f"Retry {attempt+1}/{max_retries} loading video {path}: {error_msg}")
+
+                    # Clean up and wait before retrying
+                    gc.collect()
+                    time.sleep(retry_delay * (attempt + 1))  # Increasing backoff
+                else:
+                    # Either not a resource error or we've run out of retries
+                    logger.error(f"Failed to load video {path} after {attempt+1} attempts: {error_msg}")
+                    raise RuntimeError(f"Failed to load video after {max_retries} attempts: {error_msg}")


 class ImageOrVideoDatasetWithResizing(ImageOrVideoDataset):

@@ -264,35 +311,60 @@ class ImageOrVideoDatasetWithResizing(ImageOrVideoDataset):
         return image

     def _preprocess_video(self, path: Path) -> torch.Tensor:
-        video_reader = decord.VideoReader(uri=path.as_posix())
-        video_num_frames = len(video_reader)
-        #print(f"ImageOrVideoDatasetWithResizing: self.resolution_buckets = ", self.resolution_buckets)
-        #print(f"ImageOrVideoDatasetWithResizing: self.max_num_frames = ", self.max_num_frames)
-        #print(f"ImageOrVideoDatasetWithResizing: video_num_frames = ", video_num_frames)
-
-        video_buckets = [bucket for bucket in self.resolution_buckets if bucket[0] <= video_num_frames]
+        max_retries = 3
+        retry_delay = 1.0  # seconds

-        if not video_buckets:
-            _, h, w = self.resolution_buckets[0]
-            video_buckets = [(1, h, w)]
-
-        nearest_frame_bucket = min(
-            video_buckets,
-            key=lambda x: abs(x[0] - min(video_num_frames, self.max_num_frames)),
-            default=video_buckets[0],
-        )[0]
-
-        frame_indices = list(range(0, video_num_frames, video_num_frames // nearest_frame_bucket))
-
-        frames = video_reader.get_batch(frame_indices)
-        frames = frames[:nearest_frame_bucket].float()
-        frames = frames.permute(0, 3, 1, 2).contiguous()
-
-        nearest_res = self._find_nearest_resolution(frames.shape[2], frames.shape[3])
-        frames_resized = torch.stack([resize(frame, nearest_res) for frame in frames], dim=0)
-        frames = torch.stack([self.video_transforms(frame) for frame in frames_resized], dim=0)
-        return frames
+        for attempt in range(max_retries):
+            try:
+                # Create video reader
+                video_reader = decord.VideoReader(uri=path.as_posix())
+                video_num_frames = len(video_reader)
+
+                # Find appropriate bucket for the video
+                video_buckets = [bucket for bucket in self.resolution_buckets if bucket[0] <= video_num_frames]
+
+                if not video_buckets:
+                    _, h, w = self.resolution_buckets[0]
+                    video_buckets = [(1, h, w)]
+
+                nearest_frame_bucket = min(
+                    video_buckets,
+                    key=lambda x: abs(x[0] - min(video_num_frames, self.max_num_frames)),
+                    default=video_buckets[0],
+                )[0]
+
+                # Extract and process frames
+                frame_indices = list(range(0, video_num_frames, video_num_frames // nearest_frame_bucket))
+                frames = video_reader.get_batch(frame_indices)
+                frames = frames[:nearest_frame_bucket].float()
+                frames = frames.permute(0, 3, 1, 2).contiguous()
+
+                nearest_res = self._find_nearest_resolution(frames.shape[2], frames.shape[3])
+                frames_resized = torch.stack([resize(frame, nearest_res) for frame in frames], dim=0)
+                frames = torch.stack([self.video_transforms(frame) for frame in frames_resized], dim=0)
+
+                # Explicitly clean up resources
+                del video_reader
+
+                # Force garbage collection occasionally
+                if random.random() < 0.05:  # 5% chance
+                    gc.collect()
+
+                return frames
+
+            except decord._ffi.base.DECORDError as e:
+                # Log the error
+                error_msg = str(e)
+                if "Resource temporarily unavailable" in error_msg and attempt < max_retries - 1:
+                    logger.warning(f"Retry {attempt+1}/{max_retries} loading video {path}: {error_msg}")
+
+                    # Clean up and wait before retrying
+                    gc.collect()
+                    time.sleep(retry_delay * (attempt + 1))  # Increasing backoff
+                else:
+                    # Either not a resource error or we've run out of retries
+                    logger.error(f"Failed to load video {path} after {attempt+1} attempts: {error_msg}")
+                    raise RuntimeError(f"Failed to load video after {max_retries} attempts: {error_msg}")

     def _find_nearest_resolution(self, height, width):
         nearest_res = min(self.resolution_buckets, key=lambda x: abs(x[1] - height) + abs(x[2] - width))

@@ -338,35 +410,62 @@ class ImageOrVideoDatasetWithResizeAndRectangleCrop(ImageOrVideoDataset):
         return arr

     def _preprocess_video(self, path: Path) -> torch.Tensor:
-        video_reader = decord.VideoReader(uri=path.as_posix())
-        video_num_frames = len(video_reader)
-        print(f"ImageOrVideoDatasetWithResizeAndRectangleCrop: self.resolution_buckets = ", self.resolution_buckets)
-        print(f"ImageOrVideoDatasetWithResizeAndRectangleCrop: self.max_num_frames = ", self.max_num_frames)
-        print(f"ImageOrVideoDatasetWithResizeAndRectangleCrop: video_num_frames = ", video_num_frames)
-
-        video_buckets = [bucket for bucket in self.resolution_buckets if bucket[0] <= video_num_frames]
+        max_retries = 3
+        retry_delay = 1.0  # seconds

-        if not video_buckets:
-            _, h, w = self.resolution_buckets[0]
-            video_buckets = [(1, h, w)]
-
-        nearest_frame_bucket = min(
-            video_buckets,
-            key=lambda x: abs(x[0] - min(video_num_frames, self.max_num_frames)),
-            default=video_buckets[0],
-        )[0]
-
-        frame_indices = list(range(0, video_num_frames, video_num_frames // nearest_frame_bucket))
-
-        frames = video_reader.get_batch(frame_indices)
-        frames = frames[:nearest_frame_bucket].float()
-        frames = frames.permute(0, 3, 1, 2).contiguous()
-
-        nearest_res = self._find_nearest_resolution(frames.shape[2], frames.shape[3])
-        frames_resized = self._resize_for_rectangle_crop(frames, nearest_res)
-        frames = torch.stack([self.video_transforms(frame) for frame in frames_resized], dim=0)
-        return frames
+        for attempt in range(max_retries):
+            try:
+                # Create video reader
+                video_reader = decord.VideoReader(uri=path.as_posix())
+                video_num_frames = len(video_reader)
+
+                # Find appropriate bucket for the video
+                video_buckets = [bucket for bucket in self.resolution_buckets if bucket[0] <= video_num_frames]
+
+                if not video_buckets:
+                    _, h, w = self.resolution_buckets[0]
+                    video_buckets = [(1, h, w)]
+
+                nearest_frame_bucket = min(
+                    video_buckets,
+                    key=lambda x: abs(x[0] - min(video_num_frames, self.max_num_frames)),
+                    default=video_buckets[0],
+                )[0]
+
+                # Extract and process frames
+                frame_indices = list(range(0, video_num_frames, video_num_frames // nearest_frame_bucket))
+                frames = video_reader.get_batch(frame_indices)
+                frames = frames[:nearest_frame_bucket].float()
+                frames = frames.permute(0, 3, 1, 2).contiguous()
+
+                # Fix: Change self.resolutions to self.resolution_buckets to match the class attribute
+                nearest_res = self._find_nearest_resolution(frames.shape[2], frames.shape[3])
+                frames_resized = self._resize_for_rectangle_crop(frames, nearest_res)
+                frames = torch.stack([self.video_transforms(frame) for frame in frames_resized], dim=0)
+
+                # Explicitly clean up resources
+                del video_reader
+
+                # Force garbage collection occasionally
+                if random.random() < 0.05:  # 5% chance
+                    gc.collect()
+
+                return frames
+
+            except decord._ffi.base.DECORDError as e:
+                # Log the error
+                error_msg = str(e)
+                if "Resource temporarily unavailable" in error_msg and attempt < max_retries - 1:
+                    logger.warning(f"Retry {attempt+1}/{max_retries} loading video {path}: {error_msg}")
+
+                    # Clean up and wait before retrying
+                    gc.collect()
+                    time.sleep(retry_delay * (attempt + 1))  # Increasing backoff
+                else:
+                    # Either not a resource error or we've run out of retries
+                    logger.error(f"Failed to load video {path} after {attempt+1} attempts: {error_msg}")
+                    raise RuntimeError(f"Failed to load video after {max_retries} attempts: {error_msg}")

     def _find_nearest_resolution(self, height, width):
         nearest_res = min(self.resolutions, key=lambda x: abs(x[1] - height) + abs(x[2] - width))
         return nearest_res[1], nearest_res[2]
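Note on the change above: all three `_preprocess_video` methods now wrap the decord read in the same retry-with-backoff pattern. A minimal standalone sketch of that pattern follows; it assumes the torch bridge is active as elsewhere in finetrainers, and the helper name is illustrative, not part of the commit.

import gc
import time

import decord

decord.bridge.set_bridge("torch")  # make get_batch return torch tensors, as finetrainers does


def read_batch_with_retries(path: str, max_retries: int = 3, retry_delay: float = 1.0):
    # Retry only the transient EMFILE-style failure; re-raise everything else immediately.
    for attempt in range(max_retries):
        try:
            reader = decord.VideoReader(uri=path)
            frames = reader.get_batch(list(range(len(reader))))
            del reader  # release the open file descriptor as soon as possible
            return frames
        except decord._ffi.base.DECORDError as e:
            if "Resource temporarily unavailable" in str(e) and attempt < max_retries - 1:
                gc.collect()  # collect any leaked readers still holding descriptors
                time.sleep(retry_delay * (attempt + 1))  # linear backoff: 1s, 2s, ...
            else:
                raise RuntimeError(f"Failed to load video after {attempt + 1} attempts: {e}") from e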
finetrainers/trainer.py
CHANGED

@@ -2,6 +2,7 @@ import json
 import logging
 import math
 import os
+import gc
 import random
 from datetime import datetime, timedelta
 from pathlib import Path

@@ -549,6 +550,20 @@ class Trainer:
     def train(self) -> None:
         logger.info("Starting training")

+
+        # Add these lines at the beginning
+        if hasattr(resource, 'RLIMIT_NOFILE'):
+            try:
+                soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
+                logger.info(f"Current file descriptor limits in trainer: soft={soft}, hard={hard}")
+                # Try to increase to hard limit if possible
+                if soft < hard:
+                    resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
+                    new_soft, new_hard = resource.getrlimit(resource.RLIMIT_NOFILE)
+                    logger.info(f"Updated file descriptor limits: soft={new_soft}, hard={new_hard}")
+            except Exception as e:
+                logger.warning(f"Could not check or update file descriptor limits: {e}")
+
         memory_statistics = get_memory_statistics()
         logger.info(f"Memory before training start: {json.dumps(memory_statistics, indent=4)}")

@@ -816,9 +831,15 @@
             progress_bar.set_postfix(logs)
             accelerator.log(logs, step=global_step)

+            if global_step % 100 == 0:  # Every 100 steps
+                # Force garbage collection to clean up any lingering resources
+                gc.collect()
+
             if global_step >= self.state.train_steps:
                 break

+
+
         if num_loss_updates > 0:
             epoch_loss /= num_loss_updates
             accelerator.log({"epoch_loss": epoch_loss}, step=global_step)

@@ -833,6 +854,13 @@
             if should_run_validation:
                 self.validate(global_step)

+            if epoch % 3 == 0:  # Every 3 epochs
+                logger.info("Performing periodic resource cleanup")
+                free_memory()
+                gc.collect()
+                torch.cuda.empty_cache()
+                torch.cuda.synchronize(accelerator.device)
+
        accelerator.wait_for_everyone()
        if accelerator.is_main_process:
            transformer = unwrap_model(accelerator, self.transformer)
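The same RLIMIT_NOFILE bump now runs twice: at import time in dataset.py and again at the start of train(). One caveat visible in the diff: trainer.py only adds `import gc`, so the `hasattr(resource, 'RLIMIT_NOFILE')` guard assumes `resource` is already imported somewhere in that module. A minimal standalone sketch of the limit-raising step (POSIX only; the helper name is illustrative):

import resource


def raise_open_file_limit() -> tuple[int, int]:
    """Raise the soft RLIMIT_NOFILE to the hard limit and return the resulting (soft, hard)."""
    soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
    if soft < hard:
        # An unprivileged process may raise its soft limit up to the hard limit, but not beyond it.
        resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
    return resource.getrlimit(resource.RLIMIT_NOFILE)


print(raise_open_file_limit())  # e.g. (1048576, 1048576) after starting from (1024, 1048576)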
training/cogvideox/dataset.py
CHANGED

@@ -57,7 +57,7 @@ class VideoDataset(Dataset):
         self.random_flip = random_flip
         self.image_to_video = image_to_video

-        self.resolutions = [
+        self.resolution_buckets = [
             (f, h, w) for h in self.height_buckets for w in self.width_buckets for f in self.frame_buckets
         ]


@@ -295,7 +295,7 @@ class VideoDatasetWithResizing(VideoDataset):
         return image, frames, None

     def _find_nearest_resolution(self, height, width):
-        nearest_res = min(self.resolutions, key=lambda x: abs(x[1] - height) + abs(x[2] - width))
+        nearest_res = min(self.resolution_buckets, key=lambda x: abs(x[1] - height) + abs(x[2] - width))
         return nearest_res[1], nearest_res[2]
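For reference, `resolution_buckets` holds `(frames, height, width)` tuples, and `_find_nearest_resolution` picks the bucket that minimizes the L1 distance over height and width, ignoring the frame count. A small self-contained illustration; the bucket values below are made up, not the repo's defaults:

resolution_buckets = [(49, 480, 720), (49, 720, 1280), (13, 480, 720)]


def find_nearest_resolution(height: int, width: int) -> tuple[int, int]:
    # L1 distance on (height, width); x[0] is the frame count and plays no role here.
    nearest = min(resolution_buckets, key=lambda x: abs(x[1] - height) + abs(x[2] - width))
    return nearest[1], nearest[2]


print(find_nearest_resolution(500, 700))   # -> (480, 720)
print(find_nearest_resolution(700, 1200))  # -> (720, 1280)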