Commit · 54a2a4e
1 Parent(s): 9545589

working on training job failure recovery
Files changed:
- app.py (+124 -4)
- vms/training_log_parser.py (+33 -34)
- vms/training_service.py (+188 -4)
app.py CHANGED

@@ -59,7 +59,43 @@ class VideoTrainerUI:
         self.captioner = CaptioningService()
         self._should_stop_captioning = False
         self.log_parser = TrainingLogParser()
-
+
+        # Try to recover any interrupted training sessions
+        recovery_result = self.trainer.recover_interrupted_training()
+
+        self.recovery_status = recovery_result.get("status", "unknown")
+        self.ui_updates = recovery_result.get("ui_updates", {})
+
+        if recovery_result["status"] == "recovered":
+            logger.info(f"Training recovery: {recovery_result['message']}")
+            # No need to do anything else - the training is already running
+        elif recovery_result["status"] == "running":
+            logger.info("Training process is already running")
+            # No need to do anything - the process is still alive
+        elif recovery_result["status"] in ["error", "idle"]:
+            logger.warning(f"Training status: {recovery_result['message']}")
+            # UI will be in ready-to-start mode
+
+    def update_ui_state(self, **kwargs):
+        """Update UI state with new values"""
+        current_state = self.trainer.load_ui_state()
+        current_state.update(kwargs)
+        self.trainer.save_ui_state(current_state)
+        return current_state
+
+    def load_ui_values(self):
+        """Load UI state values for initializing form fields"""
+        ui_state = self.trainer.load_ui_state()
+
+        # Convert types as needed since JSON stores everything as strings
+        ui_state["num_epochs"] = int(ui_state.get("num_epochs", 70))
+        ui_state["batch_size"] = int(ui_state.get("batch_size", 1))
+        ui_state["learning_rate"] = float(ui_state.get("learning_rate", 3e-5))
+        ui_state["save_iterations"] = int(ui_state.get("save_iterations", 500))
+
+        return ui_state
+
     def update_captioning_buttons_start(self):
         """Return individual button values instead of a dictionary"""
         return (

@@ -1120,12 +1156,55 @@
             return gr.update(value=repo_id, error=None)

         # Connect events
+
+        # Save state when model type changes
         model_type.change(
+            fn=lambda v: self.update_ui_state(model_type=v),
+            inputs=[model_type],
+            outputs=[] # No UI update needed
+        ).then(
             fn=update_model_info,
             inputs=[model_type],
             outputs=[model_info, num_epochs, batch_size, learning_rate, save_iterations]
         )

+        # the following change listeners are used for UI persistence
+        lora_rank.change(
+            fn=lambda v: self.update_ui_state(lora_rank=v),
+            inputs=[lora_rank],
+            outputs=[]
+        )
+
+        lora_alpha.change(
+            fn=lambda v: self.update_ui_state(lora_alpha=v),
+            inputs=[lora_alpha],
+            outputs=[]
+        )
+
+        num_epochs.change(
+            fn=lambda v: self.update_ui_state(num_epochs=v),
+            inputs=[num_epochs],
+            outputs=[]
+        )
+
+        batch_size.change(
+            fn=lambda v: self.update_ui_state(batch_size=v),
+            inputs=[batch_size],
+            outputs=[]
+        )
+
+        learning_rate.change(
+            fn=lambda v: self.update_ui_state(learning_rate=v),
+            inputs=[learning_rate],
+            outputs=[]
+        )
+
+        save_iterations.change(
+            fn=lambda v: self.update_ui_state(save_iterations=v),
+            inputs=[save_iterations],
+            outputs=[]
+        )
+
         async def on_import_success(enable_splitting, enable_automatic_content_captioning, prompt_prefix):
             videos = self.list_unprocessed_videos()
             # If scene detection isn't already running and there are videos to process,

@@ -1243,8 +1322,13 @@
             fn=self.list_training_files_to_caption,
             outputs=[training_dataset]
         )
-
+
+        # Save state when training preset changes
         training_preset.change(
+            fn=lambda v: self.update_ui_state(training_preset=v),
+            inputs=[training_preset],
+            outputs=[] # No UI update needed
+        ).then(
             fn=self.update_training_params,
             inputs=[training_preset],
             outputs=[

@@ -1337,13 +1421,49 @@
             ]
         )

+    # Add this new method to get initial button states:
+    def get_initial_button_states(self):
+        """Get the initial states for training buttons based on recovery status"""
+        recovery_result = self.trainer.recover_interrupted_training()
+        ui_updates = recovery_result.get("ui_updates", {})
+
+        # Return button states in the correct order
+        return (
+            gr.Button(**ui_updates.get("start_btn", {"interactive": True, "variant": "primary"})),
+            gr.Button(**ui_updates.get("stop_btn", {"interactive": False, "variant": "secondary"})),
+            gr.Button(**ui_updates.get("pause_resume_btn", {"interactive": False, "variant": "secondary"}))
+        )
+
+    def initialize_ui_from_state(self):
+        """Initialize UI components from saved state"""
+        ui_state = self.load_ui_values()
+
+        # Return values in order matching the outputs in app.load
+        return (
+            ui_state.get("training_preset", list(TRAINING_PRESETS.keys())[0]),
+            ui_state.get("model_type", list(MODEL_TYPES.keys())[0]),
+            ui_state.get("lora_rank", "128"),
+            ui_state.get("lora_alpha", "128"),
+            ui_state.get("num_epochs", 70),
+            ui_state.get("batch_size", 1),
+            ui_state.get("learning_rate", 3e-5),
+            ui_state.get("save_iterations", 500)
+        )
+
         # Auto-refresh timers
         app.load(
             fn=lambda: (
-                self.refresh_dataset()
+                self.refresh_dataset(),
+                *self.get_initial_button_states(),
+                # Load saved UI state values
+                *self.initialize_ui_from_state()
             ),
             outputs=[
-                video_list, training_dataset
+                video_list, training_dataset,
+                start_btn, stop_btn, pause_resume_btn,
+                # Add outputs for UI fields
+                training_preset, model_type, lora_rank, lora_alpha,
+                num_epochs, batch_size, learning_rate, save_iterations
             ]
         )

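Note on the persistence wiring above: every `.change()` listener funnels into `update_ui_state(...)`, which merges the new value into the trainer's saved state and writes it back to disk. A minimal standalone sketch of that round-trip, assuming a `ui_state.json` path as in this commit (the concrete file location and the `model_type` / `training_preset` defaults, which really come from `OUTPUT_PATH`, `MODEL_TYPES`, and `TRAINING_PRESETS`, are placeholders here):

import json
from pathlib import Path

# Placeholder path for illustration; the app resolves it as OUTPUT_PATH / "ui_state.json".
UI_STATE_FILE = Path("output/ui_state.json")

# Defaults mirror load_ui_state(); model_type / training_preset omitted because
# their real defaults come from MODEL_TYPES / TRAINING_PRESETS.
DEFAULTS = {
    "lora_rank": "128",
    "lora_alpha": "128",
    "num_epochs": 70,
    "batch_size": 1,
    "learning_rate": 3e-5,
    "save_iterations": 500,
}

def load_ui_state() -> dict:
    # Merge saved values over defaults so missing keys fall back cleanly.
    state = dict(DEFAULTS)
    if UI_STATE_FILE.exists():
        state.update(json.loads(UI_STATE_FILE.read_text()))
    return state

def update_ui_state(**kwargs) -> dict:
    # What each `lambda v: self.update_ui_state(field=v)` listener boils down to.
    state = load_ui_state()
    state.update(kwargs)
    UI_STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
    UI_STATE_FILE.write_text(json.dumps(state, indent=2))
    return state

print(update_ui_state(batch_size=4)["batch_size"])  # -> 4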
vms/training_log_parser.py CHANGED

@@ -34,7 +34,14 @@ class TrainingState:

     def to_dict(self) -> Dict[str, Any]:
         """Convert state to dictionary for UI updates"""
-
+        # Calculate elapsed time only if training is active and we have a start time
+        if self.start_time and self.status in ["training", "initializing"]:
+            elapsed = str(datetime.now() - self.start_time)
+        else:
+            # Use the last known elapsed time or show 0
+            elapsed = "0:00:00" if not self.last_step_time else str(self.last_step_time - self.start_time if self.start_time else "0:00:00")
+
+        # Use precomputed remaining time from logs if available
         remaining = str(self.estimated_remaining) if self.estimated_remaining else "calculating..."

         return {

@@ -74,10 +81,11 @@ class TrainingLogParser:
         if ("Started training" in line) or ("Starting training" in line):
             self.state.status = "training"

+        # Check for "Training steps:" which contains the progress information
         if "Training steps:" in line:
             # Set status to training if we see this
             self.state.status = "training"
-
+
             if not self.state.start_time:
                 self.state.start_time = datetime.now()

@@ -97,36 +105,23 @@ class TrainingLogParser:
                 if match:
                     setattr(self.state, attr, float(match.group(1)))

-            # Create formatted timedelta
-            if days > 0:
-                formatted_time = f"{days}d {hours}h {minutes}m {seconds}s"
-            elif hours > 0:
-                formatted_time = f"{hours}h {minutes}m {seconds}s"
-            elif minutes > 0:
-                formatted_time = f"{minutes}m {seconds}s"
-            else:
-                formatted_time = f"{seconds}s"
-
-            self.state.estimated_remaining = formatted_time
-            self.state.last_step_time = now
-
+            # Extract time remaining directly from the log
+            # Format: [MM:SS<M:SS:SS, SS.SSs/it]
+            time_remaining_match = re.search(r"<(\d+:\d+:\d+)", line)
+            if time_remaining_match:
+                remaining_str = time_remaining_match.group(1)
+                # Store the string directly - no need to parse it
+                self.state.estimated_remaining = remaining_str
+
+            # If no direct time estimate, look for hour:min format
+            if not time_remaining_match:
+                hour_min_match = re.search(r"<(\d+h\s*\d+m)", line)
+                if hour_min_match:
+                    self.state.estimated_remaining = hour_min_match.group(1)
+
+            # Update last processing time
+            self.state.last_step_time = datetime.now()
+
             logger.info(f"Updated training state: step={self.state.current_step}/{self.state.total_steps}, loss={self.state.step_loss}")
             return self.state.to_dict()

@@ -162,12 +157,16 @@ class TrainingLogParser:

         # Completion states
         if "Training completed successfully" in line:
-            self.state.status = "completed"
+            self.status = "completed"
+            # Store final elapsed time
+            self.last_step_time = datetime.now()
             logger.info("Training completed")
             return self.state.to_dict()

         if any(x in line for x in ["Training process stopped", "Training stopped"]):
-            self.state.status = "stopped"
+            self.status = "stopped"
+            # Store final elapsed time
+            self.last_step_time = datetime.now()
             logger.info("Training stopped")
             return self.state.to_dict()

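The parser change above stops reconstructing a timedelta by hand and instead lifts the remaining-time estimate straight out of the progress-bar text. A quick check of the two regexes against a tqdm-style line (the sample line itself is invented for illustration):

import re

# Hypothetical "Training steps:" progress line in the tqdm format the parser expects.
line = "Training steps:  28%|██▊       | 280/1000 [26:32<1:08:59, 2.13s/it]"

match = re.search(r"<(\d+:\d+:\d+)", line)  # H:MM:SS estimate after the "<"
if match:
    print(match.group(1))  # -> 1:08:59
else:
    # Fallback used when the estimate is rendered as "Xh Ym"
    hour_min = re.search(r"<(\d+h\s*\d+m)", line)
    print(hour_min.group(1) if hour_min else "calculating...")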
vms/training_service.py CHANGED

@@ -38,7 +38,7 @@ class TrainingService:
         self.setup_logging()

         logger.info("Training service initialized")
-
+
     def setup_logging(self):
         """Set up logging with proper handler management"""
         global logger

@@ -96,16 +96,58 @@ class TrainingService:
         if self.file_handler:
             self.file_handler.close()

+
+    def save_ui_state(self, values: Dict[str, Any]) -> None:
+        """Save current UI state to file"""
+        ui_state_file = OUTPUT_PATH / "ui_state.json"
+        try:
+            with open(ui_state_file, 'w') as f:
+                json.dump(values, f, indent=2)
+            logger.debug(f"UI state saved: {values}")
+        except Exception as e:
+            logger.error(f"Error saving UI state: {str(e)}")
+
+    def load_ui_state(self) -> Dict[str, Any]:
+        """Load saved UI state"""
+        ui_state_file = OUTPUT_PATH / "ui_state.json"
+        default_state = {
+            "model_type": list(MODEL_TYPES.keys())[0],
+            "lora_rank": "128",
+            "lora_alpha": "128",
+            "num_epochs": 70,
+            "batch_size": 1,
+            "learning_rate": 3e-5,
+            "save_iterations": 500,
+            "training_preset": list(TRAINING_PRESETS.keys())[0]
+        }
+
+        if not ui_state_file.exists():
+            return default_state
+
+        try:
+            with open(ui_state_file, 'r') as f:
+                saved_state = json.load(f)
+            # Make sure we have all keys (in case structure changed)
+            merged_state = default_state.copy()
+            merged_state.update(saved_state)
+            return merged_state
+        except Exception as e:
+            logger.error(f"Error loading UI state: {str(e)}")
+            return default_state
+
+    # Modify save_session to also store the UI state at training start
     def save_session(self, params: Dict) -> None:
         """Save training session parameters"""
         session_data = {
             "timestamp": datetime.now().isoformat(),
             "params": params,
-            "status": self.get_status()
+            "status": self.get_status(),
+            # Add UI state at the time training started
+            "initial_ui_state": self.load_ui_state()
         }
         with open(self.session_file, 'w') as f:
             json.dump(session_data, f, indent=2)
-
+
     def load_session(self) -> Optional[Dict]:
         """Load saved training session"""
         if self.session_file.exists():

@@ -225,6 +267,7 @@ class TrainingService:
         save_iterations: int,
         repo_id: str,
         preset_name: str,
+        resume_from_checkpoint: Optional[str] = None,
     ) -> Tuple[str, str]:
         """Start training with finetrainers"""

@@ -295,6 +338,11 @@ class TrainingService:
         config.lr = float(learning_rate)
         config.checkpointing_steps = int(save_iterations)

+        # Update with resume_from_checkpoint if provided
+        if resume_from_checkpoint:
+            config.resume_from_checkpoint = resume_from_checkpoint
+            self.append_log(f"Resuming from checkpoint: {resume_from_checkpoint}")
+
         # Common settings for both models
         config.mixed_precision = "bf16"
         config.seed = 42

@@ -477,10 +525,146 @@ class TrainingService:
         try:
             with open(self.pid_file, 'r') as f:
                 pid = int(f.read().strip())
-
+
+            # Check if process exists AND is a Python process running train.py
+            if psutil.pid_exists(pid):
+                try:
+                    process = psutil.Process(pid)
+                    cmdline = process.cmdline()
+                    # Check if it's a Python process running train.py
+                    return any('train.py' in cmd for cmd in cmdline)
+                except (psutil.NoSuchProcess, psutil.AccessDenied):
+                    return False
+            return False
         except:
             return False

+    def recover_interrupted_training(self) -> Dict[str, Any]:
+        """Attempt to recover interrupted training
+
+        Returns:
+            Dict with recovery status and UI updates
+        """
+        status = self.get_status()
+        ui_updates = {}
+
+        # If status indicates training but process isn't running, try to recover
+        if status.get('status') == 'training' and not self.is_training_running():
+            logger.info("Detected interrupted training session, attempting to recover...")
+
+            # Get the latest checkpoint
+            last_session = self.load_session()
+            if not last_session:
+                logger.warning("No session data found for recovery")
+                # Set buttons for no active training
+                ui_updates = {
+                    "start_btn": {"interactive": True, "variant": "primary"},
+                    "stop_btn": {"interactive": False, "variant": "secondary"},
+                    "pause_resume_btn": {"interactive": False, "variant": "secondary"}
+                }
+                return {"status": "error", "message": "No session data found", "ui_updates": ui_updates}
+
+            # Find the latest checkpoint
+            checkpoints = list(OUTPUT_PATH.glob("checkpoint-*"))
+            if not checkpoints:
+                logger.warning("No checkpoints found for recovery")
+                # Set buttons for no active training
+                ui_updates = {
+                    "start_btn": {"interactive": True, "variant": "primary"},
+                    "stop_btn": {"interactive": False, "variant": "secondary"},
+                    "pause_resume_btn": {"interactive": False, "variant": "secondary"}
+                }
+                return {"status": "error", "message": "No checkpoints found", "ui_updates": ui_updates}
+
+            latest_checkpoint = max(checkpoints, key=os.path.getmtime)
+            checkpoint_step = int(latest_checkpoint.name.split("-")[1])
+
+            logger.info(f"Found checkpoint at step {checkpoint_step}, attempting to resume")
+
+            # Extract parameters from the saved session (not current UI state)
+            # This ensures we use the original training parameters
+            params = last_session.get('params', {})
+            initial_ui_state = last_session.get('initial_ui_state', {})
+
+            # Add UI updates to restore the training parameters in the UI
+            # This shows the user what values are being used for the resumed training
+            ui_updates.update({
+                "model_type": gr.update(value=params.get('model_type', list(MODEL_TYPES.keys())[0])),
+                "lora_rank": gr.update(value=params.get('lora_rank', "128")),
+                "lora_alpha": gr.update(value=params.get('lora_alpha', "128")),
+                "num_epochs": gr.update(value=params.get('num_epochs', 70)),
+                "batch_size": gr.update(value=params.get('batch_size', 1)),
+                "learning_rate": gr.update(value=params.get('learning_rate', 3e-5)),
+                "save_iterations": gr.update(value=params.get('save_iterations', 500)),
+                "training_preset": gr.update(value=params.get('preset_name', list(TRAINING_PRESETS.keys())[0]))
+            })
+
+            # Attempt to resume training using the ORIGINAL parameters
+            try:
+                # Extract required parameters from the session
+                model_type = params.get('model_type')
+                lora_rank = params.get('lora_rank')
+                lora_alpha = params.get('lora_alpha')
+                num_epochs = params.get('num_epochs')
+                batch_size = params.get('batch_size')
+                learning_rate = params.get('learning_rate')
+                save_iterations = params.get('save_iterations')
+                repo_id = params.get('repo_id')
+                preset_name = params.get('preset_name', list(TRAINING_PRESETS.keys())[0])
+
+                # Attempt to resume training
+                result = self.start_training(
+                    model_type=model_type,
+                    lora_rank=lora_rank,
+                    lora_alpha=lora_alpha,
+                    num_epochs=num_epochs,
+                    batch_size=batch_size,
+                    learning_rate=learning_rate,
+                    save_iterations=save_iterations,
+                    repo_id=repo_id,
+                    preset_name=preset_name,
+                    resume_from_checkpoint=str(latest_checkpoint)
+                )
+
+                # Set buttons for active training
+                ui_updates.update({
+                    "start_btn": {"interactive": False, "variant": "secondary"},
+                    "stop_btn": {"interactive": True, "variant": "stop"},
+                    "pause_resume_btn": {"interactive": True, "variant": "secondary"}
+                })
+
+                return {
+                    "status": "recovered",
+                    "message": f"Training resumed from checkpoint {checkpoint_step}",
+                    "result": result,
+                    "ui_updates": ui_updates
+                }
+            except Exception as e:
+                logger.error(f"Failed to resume training: {str(e)}")
+                # Set buttons for no active training
+                ui_updates.update({
+                    "start_btn": {"interactive": True, "variant": "primary"},
+                    "stop_btn": {"interactive": False, "variant": "secondary"},
+                    "pause_resume_btn": {"interactive": False, "variant": "secondary"}
+                })
+                return {"status": "error", "message": f"Failed to resume: {str(e)}", "ui_updates": ui_updates}
+        elif self.is_training_running():
+            # Process is still running, set buttons accordingly
+            ui_updates = {
+                "start_btn": {"interactive": False, "variant": "secondary"},
+                "stop_btn": {"interactive": True, "variant": "stop"},
+                "pause_resume_btn": {"interactive": True, "variant": "secondary"}
+            }
+            return {"status": "running", "message": "Training process is running", "ui_updates": ui_updates}
+        else:
+            # No training process, set buttons to default state
+            ui_updates = {
+                "start_btn": {"interactive": True, "variant": "primary"},
+                "stop_btn": {"interactive": False, "variant": "secondary"},
+                "pause_resume_btn": {"interactive": False, "variant": "secondary"}
+            }
+            return {"status": "idle", "message": "No training in progress", "ui_updates": ui_updates}
+
     def clear_training_data(self) -> str:
         """Clear all training data"""
         if self.is_training_running():
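For reference, the recovery path added above boils down to picking the newest "checkpoint-<step>" directory under the output path and feeding it back into start_training(..., resume_from_checkpoint=...). A self-contained sketch of that selection step, assuming finetrainers' "checkpoint-<step>" directory naming as the diff does (the "output" path and the trainer variable in the usage comment are placeholders):

from pathlib import Path
from typing import Optional, Tuple

def find_latest_checkpoint(output_dir: Path) -> Tuple[Optional[Path], Optional[int]]:
    """Return the newest checkpoint directory and its step number, or (None, None)."""
    checkpoints = list(output_dir.glob("checkpoint-*"))
    if not checkpoints:
        return None, None
    # Newest by modification time, as in recover_interrupted_training()
    latest = max(checkpoints, key=lambda p: p.stat().st_mtime)
    step = int(latest.name.split("-")[1])
    return latest, step

# Usage sketch: resume only when the saved session says "training" but no process is alive.
# latest, step = find_latest_checkpoint(Path("output"))
# if latest is not None:
#     trainer.start_training(..., resume_from_checkpoint=str(latest))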