jbilcke-hf HF Staff committed on
Commit
c8589f9
·
1 Parent(s): adc5756

various fixes regarding session recovery

Browse files
finetrainers/dataset.py CHANGED
@@ -32,6 +32,7 @@ from .constants import ( # noqa
32
  PRECOMPUTED_LATENTS_DIR_NAME,
33
  )
34
 
 
35
 
36
  # Decord is causing us some issues!
37
  # Let's try to increase file descriptor limits to avoid this error:
@@ -49,7 +50,6 @@ try:
49
  except Exception as e:
50
  logger.warning(f"Could not check or update file descriptor limits: {e}")
51
 
52
- logger = get_logger(__name__)
53
 
54
 
55
  # TODO(aryan): This needs a refactor with separation of concerns.
 
32
  PRECOMPUTED_LATENTS_DIR_NAME,
33
  )
34
 
35
+ logger = get_logger(__name__)
36
 
37
  # Decord is causing us some issues!
38
  # Let's try to increase file descriptor limits to avoid this error:
 
50
  except Exception as e:
51
  logger.warning(f"Could not check or update file descriptor limits: {e}")
52
 
 
53
 
54
 
55
  # TODO(aryan): This needs a refactor with separation of concerns.
vms/services/trainer.py CHANGED
@@ -637,149 +637,151 @@ class TrainingService:
637
  return False
638
 
639
  def recover_interrupted_training(self) -> Dict[str, Any]:
640
- """Attempt to recover interrupted training
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
641
 
642
- Returns:
643
- Dict with recovery status and UI updates
644
- """
645
- status = self.get_status()
646
- ui_updates = {}
647
 
648
- # Check for any checkpoints, even if status doesn't indicate training
649
- checkpoints = list(OUTPUT_PATH.glob("checkpoint-*"))
650
- has_checkpoints = len(checkpoints) > 0
651
 
652
- # If status indicates training but process isn't running, or if we have checkpoints
653
- # and no active training process, try to recover
654
- if (status.get('status') in ['training', 'paused'] and not self.is_training_running()) or \
655
- (has_checkpoints and not self.is_training_running()):
656
-
657
- logger.info("Detected interrupted training session or existing checkpoints, attempting to recover...")
658
-
659
- # Get the latest checkpoint
660
- last_session = self.load_session()
661
-
662
- if not last_session:
663
- logger.warning("No session data found for recovery, but will check for checkpoints")
664
- # Try to create a default session based on UI state if we have checkpoints
665
- if has_checkpoints:
666
- ui_state = self.load_ui_state()
667
- # Create a default session using UI state values
668
- last_session = {
669
- "params": {
670
- "model_type": MODEL_TYPES.get(ui_state.get("model_type", list(MODEL_TYPES.keys())[0])),
671
- "lora_rank": ui_state.get("lora_rank", "128"),
672
- "lora_alpha": ui_state.get("lora_alpha", "128"),
673
- "num_epochs": ui_state.get("num_epochs", 70),
674
- "batch_size": ui_state.get("batch_size", 1),
675
- "learning_rate": ui_state.get("learning_rate", 3e-5),
676
- "save_iterations": ui_state.get("save_iterations", 500),
677
- "preset_name": ui_state.get("training_preset", list(TRAINING_PRESETS.keys())[0]),
678
- "repo_id": "" # Default empty repo ID
679
- }
680
- }
681
- logger.info("Created default session from UI state for recovery")
682
- else:
683
- # Set buttons for no active training
684
- ui_updates = {
685
- "start_btn": {"interactive": True, "variant": "primary", "value": "Start Training"},
686
- "stop_btn": {"interactive": False, "variant": "secondary", "value": "Stop at Last Checkpoint"},
687
- "pause_resume_btn": {"interactive": False, "variant": "secondary", "visible": False}
688
- }
689
- return {"status": "idle", "message": "No training in progress", "ui_updates": ui_updates}
690
-
691
- # Find the latest checkpoint if we have checkpoints
692
- latest_checkpoint = None
693
- checkpoint_step = 0
694
-
695
  if has_checkpoints:
696
- latest_checkpoint = max(checkpoints, key=os.path.getmtime)
697
- checkpoint_step = int(latest_checkpoint.name.split("-")[1])
698
- logger.info(f"Found checkpoint at step {checkpoint_step}")
 
 
 
 
 
 
 
 
 
 
 
 
 
699
  else:
700
- logger.warning("No checkpoints found for recovery")
701
  # Set buttons for no active training
702
  ui_updates = {
703
  "start_btn": {"interactive": True, "variant": "primary", "value": "Start Training"},
704
  "stop_btn": {"interactive": False, "variant": "secondary", "value": "Stop at Last Checkpoint"},
 
705
  "pause_resume_btn": {"interactive": False, "variant": "secondary", "visible": False}
706
  }
707
- return {"status": "error", "message": "No checkpoints found", "ui_updates": ui_updates}
708
-
709
- # Extract parameters from the saved session (not current UI state)
710
- # This ensures we use the original training parameters
711
- params = last_session.get('params', {})
712
-
713
- # Add UI updates to restore the training parameters in the UI
714
- # This shows the user what values are being used for the resumed training
715
- ui_updates.update({
716
- "model_type": params.get('model_type', list(MODEL_TYPES.keys())[0]),
717
- "lora_rank": params.get('lora_rank', "128"),
718
- "lora_alpha": params.get('lora_alpha', "128"),
719
- "num_epochs": params.get('num_epochs', 70),
720
- "batch_size": params.get('batch_size', 1),
721
- "learning_rate": params.get('learning_rate', 3e-5),
722
- "save_iterations": params.get('save_iterations', 500),
723
- "training_preset": params.get('preset_name', list(TRAINING_PRESETS.keys())[0])
724
- })
725
-
726
- # Check if we should auto-recover (immediate restart)
727
- auto_recover = True # Always auto-recover on startup
728
-
729
- if auto_recover:
730
- # Attempt to resume training using the ORIGINAL parameters
731
- try:
732
- # Extract required parameters from the session
733
- model_type = params.get('model_type')
734
- lora_rank = params.get('lora_rank')
735
- lora_alpha = params.get('lora_alpha')
736
- num_epochs = params.get('num_epochs')
737
- batch_size = params.get('batch_size')
738
- learning_rate = params.get('learning_rate')
739
- save_iterations = params.get('save_iterations')
740
- repo_id = params.get('repo_id', '')
741
- preset_name = params.get('preset_name', list(TRAINING_PRESETS.keys())[0])
742
-
743
- # Log the recovery attempt
744
- self.append_log(f"Auto-recovering training from checkpoint {checkpoint_step}")
745
- gr.Info(f"Automatically resuming training from checkpoint {checkpoint_step}")
746
-
747
- # Attempt to resume training
748
- result = self.start_training(
749
- model_type=model_type,
750
- lora_rank=lora_rank,
751
- lora_alpha=lora_alpha,
752
- num_epochs=num_epochs,
753
- batch_size=batch_size,
754
- learning_rate=learning_rate,
755
- save_iterations=save_iterations,
756
- repo_id=repo_id,
757
- preset_name=preset_name,
758
- resume_from_checkpoint=str(latest_checkpoint)
759
- )
760
-
761
- # Set buttons for active training
762
- ui_updates.update({
763
- "start_btn": {"interactive": False, "variant": "secondary", "value": "Continue Training"},
764
- "stop_btn": {"interactive": True, "variant": "primary", "value": "Stop at Last Checkpoint"},
765
- "pause_resume_btn": {"interactive": False, "variant": "secondary", "visible": False}
766
- })
767
-
768
- return {
769
- "status": "recovered",
770
- "message": f"Training resumed from checkpoint {checkpoint_step}",
771
- "result": result,
772
- "ui_updates": ui_updates
773
- }
774
- except Exception as e:
775
- logger.error(f"Failed to auto-resume training: {str(e)}")
776
- # Set buttons for manual recovery
777
- ui_updates.update({
778
- "start_btn": {"interactive": True, "variant": "primary", "value": "Continue Training"},
779
- "stop_btn": {"interactive": False, "variant": "secondary", "value": "Stop at Last Checkpoint"},
780
- "pause_resume_btn": {"interactive": False, "variant": "secondary", "visible": False}
781
- })
782
- return {"status": "error", "message": f"Failed to auto-resume: {str(e)}", "ui_updates": ui_updates}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
783
  else:
784
  # Set up UI for manual recovery
785
  ui_updates.update({
 
637
  return False
638
 
639
  def recover_interrupted_training(self) -> Dict[str, Any]:
640
+ """Attempt to recover interrupted training
641
+
642
+ Returns:
643
+ Dict with recovery status and UI updates
644
+ """
645
+ status = self.get_status()
646
+ ui_updates = {}
647
+
648
+ # Check for any checkpoints, even if status doesn't indicate training
649
+ checkpoints = list(OUTPUT_PATH.glob("checkpoint-*"))
650
+ has_checkpoints = len(checkpoints) > 0
651
+
652
+ # If status indicates training but process isn't running, or if we have checkpoints
653
+ # and no active training process, try to recover
654
+ if (status.get('status') in ['training', 'paused'] and not self.is_training_running()) or \
655
+ (has_checkpoints and not self.is_training_running()):
656
 
657
+ logger.info("Detected interrupted training session or existing checkpoints, attempting to recover...")
 
 
 
 
658
 
659
+ # Get the latest checkpoint
660
+ last_session = self.load_session()
 
661
 
662
+ if not last_session:
663
+ logger.warning("No session data found for recovery, but will check for checkpoints")
664
+ # Try to create a default session based on UI state if we have checkpoints
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
665
  if has_checkpoints:
666
+ ui_state = self.load_ui_state()
667
+ # Create a default session using UI state values
668
+ last_session = {
669
+ "params": {
670
+ "model_type": MODEL_TYPES.get(ui_state.get("model_type", list(MODEL_TYPES.keys())[0])),
671
+ "lora_rank": ui_state.get("lora_rank", "128"),
672
+ "lora_alpha": ui_state.get("lora_alpha", "128"),
673
+ "num_epochs": ui_state.get("num_epochs", 70),
674
+ "batch_size": ui_state.get("batch_size", 1),
675
+ "learning_rate": ui_state.get("learning_rate", 3e-5),
676
+ "save_iterations": ui_state.get("save_iterations", 500),
677
+ "preset_name": ui_state.get("training_preset", list(TRAINING_PRESETS.keys())[0]),
678
+ "repo_id": "" # Default empty repo ID
679
+ }
680
+ }
681
+ logger.info("Created default session from UI state for recovery")
682
  else:
 
683
  # Set buttons for no active training
684
  ui_updates = {
685
  "start_btn": {"interactive": True, "variant": "primary", "value": "Start Training"},
686
  "stop_btn": {"interactive": False, "variant": "secondary", "value": "Stop at Last Checkpoint"},
687
+ "delete_checkpoints_btn": {"interactive": False, "variant": "stop", "value": "Delete All Checkpoints"},
688
  "pause_resume_btn": {"interactive": False, "variant": "secondary", "visible": False}
689
  }
690
+ return {"status": "idle", "message": "No training in progress", "ui_updates": ui_updates}
691
+
692
+ # Find the latest checkpoint if we have checkpoints
693
+ latest_checkpoint = None
694
+ checkpoint_step = 0
695
+
696
+ if has_checkpoints:
697
+ latest_checkpoint = max(checkpoints, key=os.path.getmtime)
698
+ checkpoint_step = int(latest_checkpoint.name.split("-")[1])
699
+ logger.info(f"Found checkpoint at step {checkpoint_step}")
700
+ else:
701
+ logger.warning("No checkpoints found for recovery")
702
+ # Set buttons for no active training
703
+ ui_updates = {
704
+ "start_btn": {"interactive": True, "variant": "primary", "value": "Start Training"},
705
+ "stop_btn": {"interactive": False, "variant": "secondary", "value": "Stop at Last Checkpoint"},
706
+ "delete_checkpoints_btn": {"interactive": False, "variant": "stop", "value": "Delete All Checkpoints"},
707
+ "pause_resume_btn": {"interactive": False, "variant": "secondary", "visible": False}
708
+ }
709
+ return {"status": "error", "message": "No checkpoints found", "ui_updates": ui_updates}
710
+
711
+ # Extract parameters from the saved session (not current UI state)
712
+ # This ensures we use the original training parameters
713
+ params = last_session.get('params', {})
714
+
715
+ # Map internal model type back to display name for UI
716
+ # This is the key fix for the "ltx_video" vs "LTX-Video (LoRA)" mismatch
717
+ model_type_internal = params.get('model_type')
718
+ model_type_display = model_type_internal
719
+
720
+ # Find the display name that maps to our internal model type
721
+ for display_name, internal_name in MODEL_TYPES.items():
722
+ if internal_name == model_type_internal:
723
+ model_type_display = display_name
724
+ logger.info(f"Mapped internal model type '{model_type_internal}' to display name '{model_type_display}'")
725
+ break
726
+
727
+ # Add UI updates to restore the training parameters in the UI
728
+ # This shows the user what values are being used for the resumed training
729
+ ui_updates.update({
730
+ "model_type": model_type_display, # Use the display name for the UI dropdown
731
+ "lora_rank": params.get('lora_rank', "128"),
732
+ "lora_alpha": params.get('lora_alpha', "128"),
733
+ "num_epochs": params.get('num_epochs', 70),
734
+ "batch_size": params.get('batch_size', 1),
735
+ "learning_rate": params.get('learning_rate', 3e-5),
736
+ "save_iterations": params.get('save_iterations', 500),
737
+ "training_preset": params.get('preset_name', list(TRAINING_PRESETS.keys())[0])
738
+ })
739
+
740
+ # Check if we should auto-recover (immediate restart)
741
+ auto_recover = True # Always auto-recover on startup
742
+
743
+ if auto_recover:
744
+ # Rest of the auto-recovery code remains unchanged
745
+ try:
746
+ # Use the internal model_type for the actual training
747
+ # But keep model_type_display for the UI
748
+ result = self.start_training(
749
+ model_type=model_type_internal,
750
+ lora_rank=params.get('lora_rank', "128"),
751
+ lora_alpha=params.get('lora_alpha', "128"),
752
+ num_epochs=params.get('num_epochs', 70),
753
+ batch_size=params.get('batch_size', 1),
754
+ learning_rate=params.get('learning_rate', 3e-5),
755
+ save_iterations=params.get('save_iterations', 500),
756
+ repo_id=params.get('repo_id', ''),
757
+ preset_name=params.get('preset_name', list(TRAINING_PRESETS.keys())[0]),
758
+ resume_from_checkpoint=str(latest_checkpoint)
759
+ )
760
+
761
+ # Set buttons for active training
762
+ ui_updates.update({
763
+ "start_btn": {"interactive": False, "variant": "secondary", "value": "Continue Training"},
764
+ "stop_btn": {"interactive": True, "variant": "primary", "value": "Stop at Last Checkpoint"},
765
+ "delete_checkpoints_btn": {"interactive": False, "variant": "stop", "value": "Delete All Checkpoints"},
766
+ "pause_resume_btn": {"interactive": False, "variant": "secondary", "visible": False}
767
+ })
768
+
769
+ return {
770
+ "status": "recovered",
771
+ "message": f"Training resumed from checkpoint {checkpoint_step}",
772
+ "result": result,
773
+ "ui_updates": ui_updates
774
+ }
775
+ except Exception as e:
776
+ logger.error(f"Failed to auto-resume training: {str(e)}")
777
+ # Set buttons for manual recovery
778
+ ui_updates.update({
779
+ "start_btn": {"interactive": True, "variant": "primary", "value": "Continue Training"},
780
+ "stop_btn": {"interactive": False, "variant": "secondary", "value": "Stop at Last Checkpoint"},
781
+ "delete_checkpoints_btn": {"interactive": True, "variant": "stop", "value": "Delete All Checkpoints"},
782
+ "pause_resume_btn": {"interactive": False, "variant": "secondary", "visible": False}
783
+ })
784
+ return {"status": "error", "message": f"Failed to auto-resume: {str(e)}", "ui_updates": ui_updates}
785
  else:
786
  # Set up UI for manual recovery
787
  ui_updates.update({
vms/tabs/train_tab.py CHANGED
@@ -8,7 +8,7 @@ from typing import Dict, Any, List, Optional, Tuple
8
  from pathlib import Path
9
 
10
  from .base_tab import BaseTab
11
- from ..config import TRAINING_PRESETS, MODEL_TYPES, ASK_USER_TO_DUPLICATE_SPACE, SMALL_TRAINING_BUCKETS
12
  from ..utils import TrainingLogParser
13
 
14
  logger = logging.getLogger(__name__)
@@ -279,7 +279,7 @@ class TrainTab(BaseTab):
279
  )
280
 
281
  def handle_training_start(self, preset, model_type, *args):
282
- """Handle training start with proper log parser reset"""
283
  # Safely reset log parser if it exists
284
  if hasattr(self.app, 'log_parser') and self.app.log_parser is not None:
285
  self.app.log_parser.reset()
@@ -288,12 +288,35 @@ class TrainTab(BaseTab):
288
  from ..utils import TrainingLogParser
289
  self.app.log_parser = TrainingLogParser()
290
 
291
- # Start training
292
- return self.app.trainer.start_training(
293
- MODEL_TYPES[model_type],
294
- *args,
295
- preset_name=preset
296
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
297
 
298
  def get_model_info(self, model_type: str) -> str:
299
  """Get information about the selected model type"""
@@ -455,6 +478,23 @@ class TrainTab(BaseTab):
455
  state = self.app.trainer.get_status()
456
  logs = self.app.trainer.get_logs()
457
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
458
  # Ensure log parser is initialized
459
  if not hasattr(self.app, 'log_parser') or self.app.log_parser is None:
460
  from ..utils import TrainingLogParser
@@ -462,7 +502,7 @@ class TrainTab(BaseTab):
462
  logger.info("Initialized missing log parser")
463
 
464
  # Parse new log lines
465
- if logs:
466
  last_state = None
467
  for line in logs.splitlines():
468
  try:
@@ -480,6 +520,12 @@ class TrainTab(BaseTab):
480
  # Parse status for training state
481
  if "completed" in state["message"].lower():
482
  state["status"] = "completed"
 
 
 
 
 
 
483
 
484
  return (state["status"], state["message"], logs)
485
 
 
8
  from pathlib import Path
9
 
10
  from .base_tab import BaseTab
11
+ from ..config import TRAINING_PRESETS, OUTPUT_PATH, MODEL_TYPES, ASK_USER_TO_DUPLICATE_SPACE, SMALL_TRAINING_BUCKETS
12
  from ..utils import TrainingLogParser
13
 
14
  logger = logging.getLogger(__name__)
 
279
  )
280
 
281
  def handle_training_start(self, preset, model_type, *args):
282
+ """Handle training start with proper log parser reset and checkpoint detection"""
283
  # Safely reset log parser if it exists
284
  if hasattr(self.app, 'log_parser') and self.app.log_parser is not None:
285
  self.app.log_parser.reset()
 
288
  from ..utils import TrainingLogParser
289
  self.app.log_parser = TrainingLogParser()
290
 
291
+ # Check for latest checkpoint
292
+ checkpoints = list(OUTPUT_PATH.glob("checkpoint-*"))
293
+ resume_from = None
294
+
295
+ if checkpoints:
296
+ # Find the latest checkpoint
297
+ latest_checkpoint = max(checkpoints, key=os.path.getmtime)
298
+ resume_from = str(latest_checkpoint)
299
+ logger.info(f"Found checkpoint at {resume_from}, will resume training")
300
+
301
+ # Convert model_type display name to internal name
302
+ model_internal_type = MODEL_TYPES.get(model_type)
303
+
304
+ if not model_internal_type:
305
+ logger.error(f"Invalid model type: {model_type}")
306
+ return f"Error: Invalid model type '{model_type}'", "Model type not recognized"
307
+
308
+ # Start training (it will automatically use the checkpoint if provided)
309
+ try:
310
+ return self.app.trainer.start_training(
311
+ model_internal_type, # Use internal model type
312
+ *args,
313
+ preset_name=preset,
314
+ resume_from_checkpoint=resume_from
315
+ )
316
+ except Exception as e:
317
+ logger.exception("Error starting training")
318
+ return f"Error starting training: {str(e)}", f"Exception: {str(e)}\n\nCheck the logs for more details."
319
+
320
 
321
  def get_model_info(self, model_type: str) -> str:
322
  """Get information about the selected model type"""
 
478
  state = self.app.trainer.get_status()
479
  logs = self.app.trainer.get_logs()
480
 
481
+ # Check if training process died unexpectedly
482
+ training_died = False
483
+
484
+ if state["status"] == "training" and not self.app.trainer.is_training_running():
485
+ state["status"] = "error"
486
+ state["message"] = "Training process terminated unexpectedly."
487
+ training_died = True
488
+
489
+ # Look for error in logs
490
+ error_lines = []
491
+ for line in logs.splitlines():
492
+ if "Error:" in line or "Exception:" in line or "Traceback" in line:
493
+ error_lines.append(line)
494
+
495
+ if error_lines:
496
+ state["message"] += f"\n\nPossible error: {error_lines[-1]}"
497
+
498
  # Ensure log parser is initialized
499
  if not hasattr(self.app, 'log_parser') or self.app.log_parser is None:
500
  from ..utils import TrainingLogParser
 
502
  logger.info("Initialized missing log parser")
503
 
504
  # Parse new log lines
505
+ if logs and not training_died:
506
  last_state = None
507
  for line in logs.splitlines():
508
  try:
 
520
  # Parse status for training state
521
  if "completed" in state["message"].lower():
522
  state["status"] = "completed"
523
+ elif "error" in state["message"].lower():
524
+ state["status"] = "error"
525
+ elif "failed" in state["message"].lower():
526
+ state["status"] = "error"
527
+ elif "stopped" in state["message"].lower():
528
+ state["status"] = "stopped"
529
 
530
  return (state["status"], state["message"], logs)
531
 
vms/ui/video_trainer_ui.py CHANGED
@@ -7,7 +7,7 @@ from typing import Any, Optional, Dict, List, Union, Tuple
7
 
8
  from ..services import TrainingService, CaptioningService, SplittingService, ImportService
9
  from ..config import (
10
- STORAGE_PATH, VIDEOS_TO_SPLIT_PATH, STAGING_PATH,
11
  TRAINING_PATH, LOG_FILE_PATH, TRAINING_PRESETS, TRAINING_VIDEOS_PATH, MODEL_PATH, OUTPUT_PATH,
12
  MODEL_TYPES, SMALL_TRAINING_BUCKETS
13
  )
@@ -160,7 +160,24 @@ class VideoTrainerUI:
160
 
161
  # If we recovered training parameters from the original session
162
  ui_state = {}
163
- for param in ["model_type", "lora_rank", "lora_alpha", "num_epochs",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  "batch_size", "learning_rate", "save_iterations", "training_preset"]:
165
  if param in recovery_ui:
166
  ui_state[param] = recovery_ui[param]
@@ -175,8 +192,16 @@ class VideoTrainerUI:
175
  # Load values (potentially with recovery updates applied)
176
  ui_state = self.load_ui_values()
177
 
178
- training_preset = ui_state.get("training_preset", list(TRAINING_PRESETS.keys())[0])
179
  model_type_val = ui_state.get("model_type", list(MODEL_TYPES.keys())[0])
 
 
 
 
 
 
 
 
180
  lora_rank_val = ui_state.get("lora_rank", "128")
181
  lora_alpha_val = ui_state.get("lora_alpha", "128")
182
  num_epochs_val = int(ui_state.get("num_epochs", 70))
@@ -190,9 +215,9 @@ class VideoTrainerUI:
190
  training_dataset,
191
  start_btn,
192
  stop_btn,
193
- delete_checkpoints_btn, # Replaces pause_resume_btn
194
  training_preset,
195
- model_type_val,
196
  lora_rank_val,
197
  lora_alpha_val,
198
  num_epochs_val,
 
7
 
8
  from ..services import TrainingService, CaptioningService, SplittingService, ImportService
9
  from ..config import (
10
+ STORAGE_PATH, VIDEOS_TO_SPLIT_PATH, STAGING_PATH, OUTPUT_PATH,
11
  TRAINING_PATH, LOG_FILE_PATH, TRAINING_PRESETS, TRAINING_VIDEOS_PATH, MODEL_PATH, OUTPUT_PATH,
12
  MODEL_TYPES, SMALL_TRAINING_BUCKETS
13
  )
 
160
 
161
  # If we recovered training parameters from the original session
162
  ui_state = {}
163
+
164
+ # Handle model_type specifically - could be internal or display name
165
+ if "model_type" in recovery_ui:
166
+ model_type_value = recovery_ui["model_type"]
167
+
168
+ # If it's an internal name, convert to display name
169
+ if model_type_value not in MODEL_TYPES:
170
+ # Find the display name for this internal model type
171
+ for display_name, internal_name in MODEL_TYPES.items():
172
+ if internal_name == model_type_value:
173
+ model_type_value = display_name
174
+ logger.info(f"Converted internal model type '{recovery_ui['model_type']}' to display name '{model_type_value}'")
175
+ break
176
+
177
+ ui_state["model_type"] = model_type_value
178
+
179
+ # Copy other parameters
180
+ for param in ["lora_rank", "lora_alpha", "num_epochs",
181
  "batch_size", "learning_rate", "save_iterations", "training_preset"]:
182
  if param in recovery_ui:
183
  ui_state[param] = recovery_ui[param]
 
192
  # Load values (potentially with recovery updates applied)
193
  ui_state = self.load_ui_values()
194
 
195
+ # Ensure model_type is a display name, not internal name
196
  model_type_val = ui_state.get("model_type", list(MODEL_TYPES.keys())[0])
197
+ if model_type_val not in MODEL_TYPES:
198
+ # Convert from internal to display name
199
+ for display_name, internal_name in MODEL_TYPES.items():
200
+ if internal_name == model_type_val:
201
+ model_type_val = display_name
202
+ break
203
+
204
+ training_preset = ui_state.get("training_preset", list(TRAINING_PRESETS.keys())[0])
205
  lora_rank_val = ui_state.get("lora_rank", "128")
206
  lora_alpha_val = ui_state.get("lora_alpha", "128")
207
  num_epochs_val = int(ui_state.get("num_epochs", 70))
 
215
  training_dataset,
216
  start_btn,
217
  stop_btn,
218
+ delete_checkpoints_btn,
219
  training_preset,
220
+ model_type_val,
221
  lora_rank_val,
222
  lora_alpha_val,
223
  num_epochs_val,