Spaces:

jbilcke-hf
/

VideoModelStudio

Running

App Files Community

jbilcke committed on Mar 11

Commit

1042322

1 Parent(s): 57737a0

fixed some bugs with finetrainers CLI params

Browse files

Files changed (4) hide show

vms/config.py +3 -0
vms/services/trainer.py +25 -22
vms/tabs/train_tab.py +13 -5
vms/ui/video_trainer_ui.py +1 -1

vms/config.py CHANGED Viewed

@@ -485,6 +485,9 @@ class TrainingConfig:
         if self.precompute_conditions:
             args.append("--precompute_conditions")
         # Diffusion arguments
         if self.flow_resolution_shifting:
             args.append("--flow_resolution_shifting")

         if self.precompute_conditions:
             args.append("--precompute_conditions")
+        if hasattr(self, 'precomputation_items') and self.precomputation_items:
+            args.extend(["--precomputation_items", str(self.precomputation_items)])
         # Diffusion arguments
         if self.flow_resolution_shifting:
             args.append("--flow_resolution_shifting")

vms/services/trainer.py CHANGED Viewed

@@ -52,7 +52,10 @@ from ..utils import (
 logger = logging.getLogger(__name__)
 class TrainingService:
-    def __init__(self):
         # State and log files
         self.session_file = OUTPUT_PATH / "session.json"
         self.status_file = OUTPUT_PATH / "status.json"
@@ -565,8 +568,8 @@ class TrainingService:
         logger.info(f"{log_prefix} training with model_type={model_type}, training_type={training_type}")
         # Update progress if available
-        if progress:
-            progress(0.15, desc="Setting up training configuration")
         try:
             # Get absolute paths - FIXED to look in project root instead of within vms directory
@@ -598,8 +601,8 @@ class TrainingService:
             logger.info("Training data path: %s", TRAINING_PATH)
             # Update progress
-            if progress:
-                progress(0.2, desc="Preparing training dataset")
             videos_file, prompts_file = prepare_finetrainers_dataset()
             if videos_file is None or prompts_file is None:
@@ -616,8 +619,8 @@ class TrainingService:
                 return error_msg, "No training data available"
             # Update progress
-            if progress:
-                progress(0.25, desc="Creating dataset configuration")
             # Get preset configuration
             preset = TRAINING_PRESETS[preset_name]
@@ -627,13 +630,14 @@ class TrainingService:
             # Get the custom prompt prefix from the tabs
             custom_prompt_prefix = None
-            if hasattr(self.app, 'tabs') and 'caption_tab' in self.app.tabs:
-                if hasattr(self.app.tabs['caption_tab'], 'components') and 'custom_prompt_prefix' in self.app.tabs['caption_tab'].components:
-                    # Get the value and clean it
-                    prefix = self.app.tabs['caption_tab'].components['custom_prompt_prefix'].value
-                    if prefix:
-                        # Clean the prefix - remove trailing comma, space or comma+space
-                        custom_prompt_prefix = prefix.rstrip(', ')
             # Create a proper dataset configuration JSON file
             dataset_config_file = OUTPUT_PATH / "dataset_config.json"
@@ -725,10 +729,7 @@ class TrainingService:
             config.flow_weighting_scheme = flow_weighting_scheme
             config.lr_warmup_steps = int(lr_warmup_steps)
-            config_args.extend([
-                "--precomputation_items", str(precomputation_items)
-            ])
             # Update the NUM_GPUS variable and CUDA_VISIBLE_DEVICES
             num_gpus = min(num_gpus, get_available_gpu_count())
             if num_gpus <= 0:
@@ -757,6 +758,8 @@ class TrainingService:
             config.enable_tiling = True
             config.caption_dropout_p = DEFAULT_CAPTION_DROPOUT_P
             validation_error = self.validate_training_config(config, model_type)
             if validation_error:
                 error_msg = f"Configuration validation failed: {validation_error}"
@@ -843,8 +846,8 @@ class TrainingService:
             env["FINETRAINERS_LOG_LEVEL"] = "DEBUG"  # Added for better debugging
             env["CUDA_VISIBLE_DEVICES"] = visible_devices
-            if progress:
-                progress(0.9, desc="Launching training process")
             # Start the training process
             process = subprocess.Popen(
@@ -901,8 +904,8 @@ class TrainingService:
             logger.info(success_msg)
             # Final progress update - now we'll track it through the log monitor
-            if progress:
-                progress(1.0, desc="Training started successfully")
             return success_msg, self.get_logs()

 logger = logging.getLogger(__name__)
 class TrainingService:
+    def __init__(self, app=None):
+        # Store reference to app
+        self.app = app
         # State and log files
         self.session_file = OUTPUT_PATH / "session.json"
         self.status_file = OUTPUT_PATH / "status.json"
         logger.info(f"{log_prefix} training with model_type={model_type}, training_type={training_type}")
         # Update progress if available
+        #if progress:
+        #    progress(0.15, desc="Setting up training configuration")
         try:
             # Get absolute paths - FIXED to look in project root instead of within vms directory
             logger.info("Training data path: %s", TRAINING_PATH)
             # Update progress
+            #if progress:
+            #    progress(0.2, desc="Preparing training dataset")
             videos_file, prompts_file = prepare_finetrainers_dataset()
             if videos_file is None or prompts_file is None:
                 return error_msg, "No training data available"
             # Update progress
+            #if progress:
+            #    progress(0.25, desc="Creating dataset configuration")
             # Get preset configuration
             preset = TRAINING_PRESETS[preset_name]
             # Get the custom prompt prefix from the tabs
             custom_prompt_prefix = None
+            if hasattr(self, 'app') and self.app is not None:
+                if hasattr(self.app, 'tabs') and 'caption_tab' in self.app.tabs:
+                    if hasattr(self.app.tabs['caption_tab'], 'components') and 'custom_prompt_prefix' in self.app.tabs['caption_tab'].components:
+                        # Get the value and clean it
+                        prefix = self.app.tabs['caption_tab'].components['custom_prompt_prefix'].value
+                        if prefix:
+                            # Clean the prefix - remove trailing comma, space or comma+space
+                            custom_prompt_prefix = prefix.rstrip(', ')
             # Create a proper dataset configuration JSON file
             dataset_config_file = OUTPUT_PATH / "dataset_config.json"
             config.flow_weighting_scheme = flow_weighting_scheme
             config.lr_warmup_steps = int(lr_warmup_steps)
             # Update the NUM_GPUS variable and CUDA_VISIBLE_DEVICES
             num_gpus = min(num_gpus, get_available_gpu_count())
             if num_gpus <= 0:
             config.enable_tiling = True
             config.caption_dropout_p = DEFAULT_CAPTION_DROPOUT_P
+            config.precomputation_items = precomputation_items
             validation_error = self.validate_training_config(config, model_type)
             if validation_error:
                 error_msg = f"Configuration validation failed: {validation_error}"
             env["FINETRAINERS_LOG_LEVEL"] = "DEBUG"  # Added for better debugging
             env["CUDA_VISIBLE_DEVICES"] = visible_devices
+            #if progress:
+            #    progress(0.9, desc="Launching training process")
             # Start the training process
             process = subprocess.Popen(
             logger.info(success_msg)
             # Final progress update - now we'll track it through the log monitor
+            #if progress:
+            #    progress(1.0, desc="Training started successfully")
             return success_msg, self.get_logs()

vms/tabs/train_tab.py CHANGED Viewed

@@ -384,7 +384,9 @@ class TrainTab(BaseTab):
             outputs=[self.components["status_box"]]
         )
-    def handle_training_start(self, preset, model_type, training_type, *args, progress=gr.Progress()):
         """Handle training start with proper log parser reset and checkpoint detection"""
         # Safely reset log parser if it exists
         if hasattr(self.app, 'log_parser') and self.app.log_parser is not None:
@@ -395,7 +397,7 @@ class TrainTab(BaseTab):
             self.app.log_parser = TrainingLogParser()
         # Initialize progress
-        progress(0, desc="Initializing training")
         # Check for latest checkpoint
         checkpoints = list(OUTPUT_PATH.glob("checkpoint-*"))
@@ -406,9 +408,10 @@ class TrainTab(BaseTab):
             latest_checkpoint = max(checkpoints, key=os.path.getmtime)
             resume_from = str(latest_checkpoint)
             logger.info(f"Found checkpoint at {resume_from}, will resume training")
-            progress(0.05, desc=f"Resuming from checkpoint {Path(resume_from).name}")
         else:
-            progress(0.05, desc="Starting new training run")
         # Convert model_type display name to internal name
         model_internal_type = MODEL_TYPES.get(model_type)
@@ -424,8 +427,13 @@ class TrainTab(BaseTab):
             logger.error(f"Invalid training type: {training_type}")
             return f"Error: Invalid training type '{training_type}'", "Training type not recognized"
         # Progress update
-        progress(0.1, desc="Preparing dataset")
         # Start training (it will automatically use the checkpoint if provided)
         try:

             outputs=[self.components["status_box"]]
         )
+    def handle_training_start(
+        self, preset, model_type, training_type, lora_rank, lora_alpha, train_steps, batch_size, learning_rate, save_iterations, repo_id, progress=gr.Progress()
+    ):
         """Handle training start with proper log parser reset and checkpoint detection"""
         # Safely reset log parser if it exists
         if hasattr(self.app, 'log_parser') and self.app.log_parser is not None:
             self.app.log_parser = TrainingLogParser()
         # Initialize progress
+        #progress(0, desc="Initializing training")
         # Check for latest checkpoint
         checkpoints = list(OUTPUT_PATH.glob("checkpoint-*"))
             latest_checkpoint = max(checkpoints, key=os.path.getmtime)
             resume_from = str(latest_checkpoint)
             logger.info(f"Found checkpoint at {resume_from}, will resume training")
+            #progress(0.05, desc=f"Resuming from checkpoint {Path(resume_from).name}")
         else:
+            #progress(0.05, desc="Starting new training run")
+            pass
         # Convert model_type display name to internal name
         model_internal_type = MODEL_TYPES.get(model_type)
             logger.error(f"Invalid training type: {training_type}")
             return f"Error: Invalid training type '{training_type}'", "Training type not recognized"
+        # Get other parameters from UI form
+        num_gpus = int(self.components["num_gpus"].value)
+        precomputation_items = int(self.components["precomputation_items"].value)
+        lr_warmup_steps = int(self.components["lr_warmup_steps"].value)
         # Progress update
+        #progress(0.1, desc="Preparing dataset")
         # Start training (it will automatically use the checkpoint if provided)
         try:

vms/ui/video_trainer_ui.py CHANGED Viewed

@@ -40,7 +40,7 @@ class VideoTrainerUI:
     def __init__(self):
         """Initialize services and tabs"""
         # Initialize core services
-        self.trainer = TrainingService()
         self.splitter = SplittingService()
         self.importer = ImportService()
         self.captioner = CaptioningService()

     def __init__(self):
         """Initialize services and tabs"""
         # Initialize core services
+        self.trainer = TrainingService(self)
         self.splitter = SplittingService()
         self.importer = ImportService()
         self.captioner = CaptioningService()