Spaces:

jbilcke-hf
/

VideoModelStudio

Running

App Files Files Community

jbilcke-hf HF Staff committed on Mar 14

Commit

a3e57a3

1 Parent(s): cb66746

debugging checkpoint restoration

Browse files

Files changed (6) hide show

vms/ui/app_ui.py +29 -17
vms/ui/project/services/previewing.py +3 -1
vms/ui/project/services/training.py +6 -4
vms/ui/project/tabs/manage_tab.py +137 -2
vms/ui/project/tabs/preview_tab.py +2 -1
vms/ui/project/tabs/train_tab.py +147 -58

vms/ui/app_ui.py CHANGED Viewed

@@ -214,8 +214,9 @@ class AppUI:
                 outputs=[
                     self.project_tabs["caption_tab"].components["training_dataset"],
                     self.project_tabs["train_tab"].components["start_btn"],
                     self.project_tabs["train_tab"].components["stop_btn"],
-                    self.project_tabs["train_tab"].components["pause_resume_btn"],
                     self.project_tabs["train_tab"].components["training_preset"],
                     self.project_tabs["train_tab"].components["model_type"],
                     self.project_tabs["train_tab"].components["model_version"],
@@ -240,7 +241,7 @@ class AppUI:
         # Status update timer for text components (every 1 second)
         status_timer = gr.Timer(value=1)
         status_timer.tick(
-            fn=self.project_tabs["train_tab"].get_status_updates,  # Use a new function that returns appropriate updates
             outputs=[
                 self.project_tabs["train_tab"].components["status_box"],
                 self.project_tabs["train_tab"].components["log_box"],
@@ -252,20 +253,23 @@ class AppUI:
         button_timer = gr.Timer(value=1)
         button_outputs = [
             self.project_tabs["train_tab"].components["start_btn"],
-            self.project_tabs["train_tab"].components["stop_btn"]
         ]
         # Add delete_checkpoints_btn or pause_resume_btn as the third button
         if "delete_checkpoints_btn" in self.project_tabs["train_tab"].components:
             button_outputs.append(self.project_tabs["train_tab"].components["delete_checkpoints_btn"])
         elif "pause_resume_btn" in self.project_tabs["train_tab"].components:
             button_outputs.append(self.project_tabs["train_tab"].components["pause_resume_btn"])
-        button_timer.tick(
-            fn=self.project_tabs["train_tab"].get_button_updates,  # Use a new function for button-specific updates
-            outputs=button_outputs
-        )
         # Dataset refresh timer (every 5 seconds)
         dataset_timer = gr.Timer(value=5)
         dataset_timer.tick(
@@ -293,9 +297,10 @@ class AppUI:
         # Get button states based on recovery status
         button_states = self.get_initial_button_states()
         start_btn = button_states[0]
-        stop_btn = button_states[1]
-        delete_checkpoints_btn = button_states[2]  # This replaces pause_resume_btn in the response tuple
         # Get UI form values - possibly from the recovery
         if self.recovery_status in ["recovered", "ready_to_recover", "running"] and "ui_updates" in self.state["recovery_result"]:
             recovery_ui = self.state["recovery_result"]["ui_updates"]
@@ -467,6 +472,7 @@ class AppUI:
         return (
             training_dataset,
             start_btn,
             stop_btn,
             delete_checkpoints_btn,
             training_preset,
@@ -543,7 +549,8 @@ class AppUI:
         ui_updates = recovery_result.get("ui_updates", {})
         # Check for checkpoints to determine start button text
-        has_checkpoints = len(list(OUTPUT_PATH.glob("checkpoint-*"))) > 0
         # Default button states if recovery didn't provide any
         if not ui_updates or not ui_updates.get("start_btn"):
@@ -551,27 +558,32 @@ class AppUI:
             if is_training:
                 # Active training detected
-                start_btn_props = {"interactive": False, "variant": "secondary", "value": "Continue Training" if has_checkpoints else "Start Training"}
                 stop_btn_props = {"interactive": True, "variant": "primary", "value": "Stop at Last Checkpoint"}
                 delete_btn_props = {"interactive": False, "variant": "stop", "value": "Delete All Checkpoints"}
             else:
                 # No active training
-                start_btn_props = {"interactive": True, "variant": "primary", "value": "Continue Training" if has_checkpoints else "Start Training"}
                 stop_btn_props = {"interactive": False, "variant": "secondary", "value": "Stop at Last Checkpoint"}
                 delete_btn_props = {"interactive": has_checkpoints, "variant": "stop", "value": "Delete All Checkpoints"}
         else:
-            # Use button states from recovery
-            start_btn_props = ui_updates.get("start_btn", {"interactive": True, "variant": "primary", "value": "Start Training"})
             stop_btn_props = ui_updates.get("stop_btn", {"interactive": False, "variant": "secondary", "value": "Stop at Last Checkpoint"})
             delete_btn_props = ui_updates.get("delete_checkpoints_btn", {"interactive": has_checkpoints, "variant": "stop", "value": "Delete All Checkpoints"})
         # Return button states in the correct order
         return (
             gr.Button(**start_btn_props),
             gr.Button(**stop_btn_props),
             gr.Button(**delete_btn_props)
         )
     def update_titles(self) -> Tuple[Any]:
         """Update all dynamic titles with current counts

                 outputs=[
                     self.project_tabs["caption_tab"].components["training_dataset"],
                     self.project_tabs["train_tab"].components["start_btn"],
+                    self.project_tabs["train_tab"].components["resume_btn"],
                     self.project_tabs["train_tab"].components["stop_btn"],
+                    self.project_tabs["train_tab"].components["delete_checkpoints_btn"],
                     self.project_tabs["train_tab"].components["training_preset"],
                     self.project_tabs["train_tab"].components["model_type"],
                     self.project_tabs["train_tab"].components["model_version"],
         # Status update timer for text components (every 1 second)
         status_timer = gr.Timer(value=1)
         status_timer.tick(
+            fn=self.project_tabs["train_tab"].get_status_updates,
             outputs=[
                 self.project_tabs["train_tab"].components["status_box"],
                 self.project_tabs["train_tab"].components["log_box"],
         button_timer = gr.Timer(value=1)
         button_outputs = [
             self.project_tabs["train_tab"].components["start_btn"],
+            self.project_tabs["train_tab"].components["resume_btn"],
+            self.project_tabs["train_tab"].components["stop_btn"],
+            self.project_tabs["train_tab"].components["delete_checkpoints_btn"]
         ]
+        button_timer.tick(
+            fn=self.project_tabs["train_tab"].get_button_updates,
+            outputs=button_outputs
+        )
         # Add delete_checkpoints_btn or pause_resume_btn as the third button
         if "delete_checkpoints_btn" in self.project_tabs["train_tab"].components:
             button_outputs.append(self.project_tabs["train_tab"].components["delete_checkpoints_btn"])
         elif "pause_resume_btn" in self.project_tabs["train_tab"].components:
             button_outputs.append(self.project_tabs["train_tab"].components["pause_resume_btn"])
         # Dataset refresh timer (every 5 seconds)
         dataset_timer = gr.Timer(value=5)
         dataset_timer.tick(
         # Get button states based on recovery status
         button_states = self.get_initial_button_states()
         start_btn = button_states[0]
+        resume_btn = button_states[1]
+        stop_btn = button_states[2]
+        delete_checkpoints_btn = button_states[3]
         # Get UI form values - possibly from the recovery
         if self.recovery_status in ["recovered", "ready_to_recover", "running"] and "ui_updates" in self.state["recovery_result"]:
             recovery_ui = self.state["recovery_result"]["ui_updates"]
         return (
             training_dataset,
             start_btn,
+            resume_btn,
             stop_btn,
             delete_checkpoints_btn,
             training_preset,
         ui_updates = recovery_result.get("ui_updates", {})
         # Check for checkpoints to determine start button text
+        checkpoints = list(OUTPUT_PATH.glob("finetrainers_step_*"))
+        has_checkpoints = len(checkpoints) > 0
         # Default button states if recovery didn't provide any
         if not ui_updates or not ui_updates.get("start_btn"):
             if is_training:
                 # Active training detected
+                start_btn_props = {"interactive": False, "variant": "secondary", "value": "Start new training"}
+                resume_btn_props = {"interactive": False, "variant": "secondary", "value": "Start from latest checkpoint"}
                 stop_btn_props = {"interactive": True, "variant": "primary", "value": "Stop at Last Checkpoint"}
                 delete_btn_props = {"interactive": False, "variant": "stop", "value": "Delete All Checkpoints"}
             else:
                 # No active training
+                start_btn_props = {"interactive": True, "variant": "primary", "value": "Start new training"}
+                resume_btn_props = {"interactive": has_checkpoints, "variant": "primary", "value": "Start from latest checkpoint"}
                 stop_btn_props = {"interactive": False, "variant": "secondary", "value": "Stop at Last Checkpoint"}
                 delete_btn_props = {"interactive": has_checkpoints, "variant": "stop", "value": "Delete All Checkpoints"}
         else:
+            # Use button states from recovery, adding the new resume button
+            start_btn_props = ui_updates.get("start_btn", {"interactive": True, "variant": "primary", "value": "Start new training"})
+            resume_btn_props = {"interactive": has_checkpoints and not self.training.is_training_running(),
+                            "variant": "primary", "value": "Start from latest checkpoint"}
             stop_btn_props = ui_updates.get("stop_btn", {"interactive": False, "variant": "secondary", "value": "Stop at Last Checkpoint"})
             delete_btn_props = ui_updates.get("delete_checkpoints_btn", {"interactive": has_checkpoints, "variant": "stop", "value": "Delete All Checkpoints"})
         # Return button states in the correct order
         return (
             gr.Button(**start_btn_props),
+            gr.Button(**resume_btn_props),  # Add the new resume button
             gr.Button(**stop_btn_props),
             gr.Button(**delete_btn_props)
         )
     def update_titles(self) -> Tuple[Any]:
         """Update all dynamic titles with current counts

vms/ui/project/services/previewing.py CHANGED Viewed

@@ -36,7 +36,9 @@ class PreviewingService:
                 return str(lora_path)
             # If not found in the expected location, try to find in checkpoints
-            checkpoints = list(OUTPUT_PATH.glob("checkpoint-*"))
             if not checkpoints:
                 return None

                 return str(lora_path)
             # If not found in the expected location, try to find in checkpoints
+            checkpoints = list(OUTPUT_PATH.glob("finetrainers_step_*"))
+            has_checkpoints = len(checkpoints) > 0
             if not checkpoints:
                 return None

vms/ui/project/services/training.py CHANGED Viewed

@@ -1042,7 +1042,7 @@ class TrainingService:
         ui_updates = {}
         # Check for any checkpoints, even if status doesn't indicate training
-        checkpoints = list(OUTPUT_PATH.glob("checkpoint-*"))
         has_checkpoints = len(checkpoints) > 0
         # If status indicates training but process isn't running, or if we have checkpoints
@@ -1078,6 +1078,7 @@ class TrainingService:
                     }
                     logger.info("Created default session from UI state for recovery")
                 else:
                     # Set buttons for no active training
                     ui_updates = {
                         "start_btn": {"interactive": True, "variant": "primary", "value": "Start Training"},
@@ -1092,8 +1093,9 @@ class TrainingService:
             checkpoint_step = 0
             if has_checkpoints:
-                latest_checkpoint = max(checkpoints, key=os.path.getmtime)
-                checkpoint_step = int(latest_checkpoint.name.split("-")[1])
                 logger.info(f"Found checkpoint at step {checkpoint_step}")
             else:
                 logger.warning("No checkpoints found for recovery")
@@ -1226,7 +1228,7 @@ class TrainingService:
         try:
             # Find all checkpoint directories
-            checkpoints = list(OUTPUT_PATH.glob("checkpoint-*"))
             if not checkpoints:
                 return "No checkpoints found to delete."

         ui_updates = {}
         # Check for any checkpoints, even if status doesn't indicate training
+        checkpoints = list(OUTPUT_PATH.glob("finetrainers_step_*"))
         has_checkpoints = len(checkpoints) > 0
         # If status indicates training but process isn't running, or if we have checkpoints
                     }
                     logger.info("Created default session from UI state for recovery")
                 else:
+                    logger.warning(f"No checkpoints found for recovery")
                     # Set buttons for no active training
                     ui_updates = {
                         "start_btn": {"interactive": True, "variant": "primary", "value": "Start Training"},
             checkpoint_step = 0
             if has_checkpoints:
+                # Find the latest checkpoint by step number
+                latest_checkpoint = max(checkpoints, key=lambda x: int(x.name.split("_")[-1]))
+                checkpoint_step = int(latest_checkpoint.name.split("_")[-1])
                 logger.info(f"Found checkpoint at step {checkpoint_step}")
             else:
                 logger.warning("No checkpoints found for recovery")
         try:
             # Find all checkpoint directories
+            checkpoints = list(OUTPUT_PATH.glob("finetrainers_step_*"))
             if not checkpoints:
                 return "No checkpoints found to delete."

vms/ui/project/tabs/manage_tab.py CHANGED Viewed

@@ -65,11 +65,43 @@ class ManageTab(BaseTab):
             with gr.Row():
                 with gr.Column():
-                    gr.Markdown("## Delete your model")
-                    gr.Markdown("If something went wrong, you can trigger a full reset (model shutdown + data destruction).")
                     gr.Markdown("Make sure you have made a backup first.")
                     gr.Markdown("If you are deleting because of a bug, remember you can use the Developer Mode on HF to inspect the working directory (in /data or .data)")
             with gr.Row():
                 self.components["global_stop_btn"] = gr.Button(
                     "Stop everything and delete my data",
@@ -103,6 +135,24 @@ class ManageTab(BaseTab):
             outputs=[self.components["download_model_btn"]]
         )
         # Global stop button
         self.components["global_stop_btn"].click(
             fn=self.handle_global_stop,
@@ -151,6 +201,91 @@ class ManageTab(BaseTab):
             return f"Successfully uploaded model to {repo_id}"
         else:
             return f"Failed to upload model to {repo_id}"
     def handle_global_stop(self):
         """Handle the global stop button click"""

             with gr.Row():
                 with gr.Column():
+                    gr.Markdown("## Delete your data")
                     gr.Markdown("Make sure you have made a backup first.")
                     gr.Markdown("If you are deleting because of a bug, remember you can use the Developer Mode on HF to inspect the working directory (in /data or .data)")
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("### Delete specific data")
+                    gr.Markdown("You can selectively delete either the dataset and/or the last model data.")
+            with gr.Row():
+                with gr.Column(scale=1):
+                    self.components["delete_dataset_btn"] = gr.Button(
+                        "Delete dataset (images, video, captions)",
+                        variant="secondary"
+                    )
+                    self.components["delete_dataset_status"] = gr.Textbox(
+                        label="Delete Dataset Status",
+                        interactive=False,
+                        visible=False
+                    )
+                with gr.Column(scale=1):
+                    self.components["delete_model_btn"] = gr.Button(
+                        "Delete model (checkpoints, weights, config)",
+                        variant="secondary"
+                    )
+                    self.components["delete_model_status"] = gr.Textbox(
+                        label="Delete Model Status",
+                        interactive=False,
+                        visible=False
+                    )
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("### Delete everything")
+                    gr.Markdown("This will delete both the dataset (all images, videos and captions) AND the latest model (weights, checkpoints, settings). So use with care!")
             with gr.Row():
                 self.components["global_stop_btn"] = gr.Button(
                     "Stop everything and delete my data",
             outputs=[self.components["download_model_btn"]]
         )
+        # New delete dataset button
+        self.components["delete_dataset_btn"].click(
+            fn=self.delete_dataset,
+            outputs=[
+                self.components["delete_dataset_status"],
+                self.app.tabs["caption_tab"].components["training_dataset"]
+            ]
+        )
+        # New delete model button
+        self.components["delete_model_btn"].click(
+            fn=self.delete_model,
+            outputs=[
+                self.components["delete_model_status"],
+                self.app.tabs["train_tab"].components["status_box"]
+            ]
+        )
         # Global stop button
         self.components["global_stop_btn"].click(
             fn=self.handle_global_stop,
             return f"Successfully uploaded model to {repo_id}"
         else:
             return f"Failed to upload model to {repo_id}"
+    def delete_dataset(self):
+        """Delete dataset files (images, videos, captions)"""
+        status_messages = {}
+        try:
+            # Stop captioning if running
+            if self.app.captioning:
+                self.app.captioning.stop_captioning()
+                status_messages["captioning"] = "Captioning stopped"
+            # Stop scene detection if running
+            if self.app.splitting.is_processing():
+                self.app.splitting.processing = False
+                status_messages["splitting"] = "Scene detection stopped"
+            # Clear dataset directories
+            for path in [VIDEOS_TO_SPLIT_PATH, STAGING_PATH, TRAINING_VIDEOS_PATH, TRAINING_PATH]:
+                if path.exists():
+                    try:
+                        shutil.rmtree(path)
+                        path.mkdir(parents=True, exist_ok=True)
+                    except Exception as e:
+                        status_messages[f"clear_{path.name}"] = f"Error clearing {path.name}: {str(e)}"
+                    else:
+                        status_messages[f"clear_{path.name}"] = f"Cleared {path.name}"
+            # Reset any relevant persistent state
+            self.app.tabs["caption_tab"]._should_stop_captioning = True
+            self.app.splitting.processing = False
+            # Format response
+            details = "\n".join(f"{k}: {v}" for k, v in status_messages.items())
+            message = f"Dataset deleted successfully\n\nDetails:\n{details}"
+            # Get fresh lists after cleanup
+            clips = self.app.tabs["caption_tab"].list_training_files_to_caption()
+            return gr.update(value=message, visible=True), clips
+        except Exception as e:
+            error_message = f"Error deleting dataset: {str(e)}\n\nDetails:\n{status_messages}"
+            return gr.update(value=error_message, visible=True), self.app.tabs["caption_tab"].list_training_files_to_caption()
+    def delete_model(self):
+        """Delete model files (checkpoints, weights, configuration)"""
+        status_messages = {}
+        try:
+            # Stop training if running
+            if self.app.training.is_training_running():
+                training_result = self.app.training.stop_training()
+                status_messages["training"] = training_result["status"]
+            # Clear model output directory
+            if OUTPUT_PATH.exists():
+                try:
+                    shutil.rmtree(OUTPUT_PATH)
+                    OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
+                except Exception as e:
+                    status_messages[f"clear_{OUTPUT_PATH.name}"] = f"Error clearing {OUTPUT_PATH.name}: {str(e)}"
+                else:
+                    status_messages[f"clear_{OUTPUT_PATH.name}"] = f"Cleared {OUTPUT_PATH.name}"
+            # Properly close logging before clearing log file
+            if self.app.training.file_handler:
+                self.app.training.file_handler.close()
+                logger.removeHandler(self.app.training.file_handler)
+                self.app.training.file_handler = None
+            if LOG_FILE_PATH.exists():
+                LOG_FILE_PATH.unlink()
+            # Reset training UI state
+            self.app.training.setup_logging()
+            # Format response
+            details = "\n".join(f"{k}: {v}" for k, v in status_messages.items())
+            message = f"Model deleted successfully\n\nDetails:\n{details}"
+            return gr.update(value=message, visible=True), "Model files have been deleted"
+        except Exception as e:
+            error_message = f"Error deleting model: {str(e)}\n\nDetails:\n{status_messages}"
+            return gr.update(value=error_message, visible=True), f"Error deleting model: {str(e)}"
     def handle_global_stop(self):
         """Handle the global stop button click"""

vms/ui/project/tabs/preview_tab.py CHANGED Viewed

@@ -219,7 +219,8 @@ class PreviewTab(BaseTab):
             return True
         # If not found in the expected location, try to find in checkpoints
-        checkpoints = list(OUTPUT_PATH.glob("checkpoint-*"))
         if not checkpoints:
             return False

             return True
         # If not found in the expected location, try to find in checkpoints
+        checkpoints = list(OUTPUT_PATH.glob("finetrainers_step_*"))
+        has_checkpoints = len(checkpoints) > 0
         if not checkpoints:
             return False

vms/ui/project/tabs/train_tab.py CHANGED Viewed

@@ -6,6 +6,7 @@ import gradio as gr
 import logging
 import os
 import json
 from typing import Dict, Any, List, Optional, Tuple
 from pathlib import Path
@@ -177,39 +178,58 @@ class TrainTab(BaseTab):
                             precision=0,
                             info="Number of warmup steps (typically 20-40% of total training steps). This helps reducing the impact of early training examples as well as giving time to optimizers to compute accurate statistics."
                         )
-                with gr.Column():
-                    with gr.Row():
-                        # Check for existing checkpoints to determine button text
-                        has_checkpoints = len(list(OUTPUT_PATH.glob("checkpoint-*"))) > 0
-                        start_text = "Continue Training" if has_checkpoints else "Start Training"
-                        self.components["start_btn"] = gr.Button(
-                            start_text,
-                            variant="primary",
-                            interactive=not ASK_USER_TO_DUPLICATE_SPACE
-                        )
-                        # Just use stop and pause buttons for now to ensure compatibility
-                        self.components["stop_btn"] = gr.Button(
-                            "Stop at Last Checkpoint",
-                            variant="primary",
-                            interactive=False
-                        )
-                        self.components["pause_resume_btn"] = gr.Button(
-                            "Resume Training",
-                            variant="secondary",
-                            interactive=False,
-                            visible=False
-                        )
-                        # Add delete checkpoints button
-                        self.components["delete_checkpoints_btn"] = gr.Button(
-                            "Delete All Checkpoints",
-                            variant="stop",
-                            interactive=True
-                        )
                     with gr.Row():
                         with gr.Column():
                             self.components["status_box"] = gr.Textbox(
@@ -226,12 +246,12 @@ class TrainTab(BaseTab):
                                 elem_id="current_task_display"
                             )
-                            with gr.Accordion("See training logs"):
                                 self.components["log_box"] = gr.TextArea(
-                                    label="Finetrainers output (see HF Space logs for more details)",
                                     interactive=False,
-                                    lines=40,
-                                    max_lines=200,
                                     autoscroll=True
                                 )
@@ -268,6 +288,55 @@ class TrainTab(BaseTab):
         self.app.update_ui_state(model_type=model_type, model_version=model_version)
         return None
     def connect_events(self) -> None:
         """Connect event handlers to UI components"""
         # Model type change event - Update model version dropdown choices
@@ -396,11 +465,11 @@ class TrainTab(BaseTab):
         # Training control events
         self.components["start_btn"].click(
-            fn=self.handle_training_start,
             inputs=[
                 self.components["training_preset"],
                 self.components["model_type"],
-                self.components["model_version"],  # Add model_version to the inputs
                 self.components["training_type"],
                 self.components["lora_rank"],
                 self.components["lora_alpha"],
@@ -416,6 +485,28 @@ class TrainTab(BaseTab):
             ]
         )
         # Use simplified event handlers for pause/resume and stop
         third_btn = self.components["delete_checkpoints_btn"] if "delete_checkpoints_btn" in self.components else self.components["pause_resume_btn"]
@@ -500,7 +591,8 @@ class TrainTab(BaseTab):
             self.app.log_parser = TrainingLogParser()
         # Check for latest checkpoint
-        checkpoints = list(OUTPUT_PATH.glob("checkpoint-*"))
         resume_from = None
         if checkpoints:
@@ -863,43 +955,40 @@ class TrainTab(BaseTab):
         status, _, _ = self.get_latest_status_message_and_logs()
         # Add checkpoints detection
-        has_checkpoints = len(list(OUTPUT_PATH.glob("checkpoint-*"))) > 0
         is_training = status in ["training", "initializing"]
         is_completed = status in ["completed", "error", "stopped"]
-        start_text = "Continue Training" if has_checkpoints else "Start Training"
         # Create button updates
         start_btn = gr.Button(
-            value=start_text,
             interactive=not is_training,
             variant="primary" if not is_training else "secondary"
         )
         stop_btn = gr.Button(
             value="Stop at Last Checkpoint",
             interactive=is_training,
             variant="primary" if is_training else "secondary"
         )
-        # Add delete_checkpoints_btn or pause_resume_btn
-        if "delete_checkpoints_btn" in self.components:
-            third_btn = gr.Button(
-                "Delete All Checkpoints",
-                interactive=has_checkpoints and not is_training,
-                variant="stop"
-            )
-        else:
-            third_btn = gr.Button(
-                "Resume Training" if status == "paused" else "Pause Training",
-                interactive=(is_training or status == "paused") and not is_completed,
-                variant="secondary",
-                visible=False
-            )
-        return start_btn, stop_btn, third_btn
     def update_training_ui(self, training_state: Dict[str, Any]):
         """Update UI components based on training state"""
         updates = {}

 import logging
 import os
 import json
+import shutil
 from typing import Dict, Any, List, Optional, Tuple
 from pathlib import Path
                             precision=0,
                             info="Number of warmup steps (typically 20-40% of total training steps). This helps reducing the impact of early training examples as well as giving time to optimizers to compute accurate statistics."
                         )
+                with gr.Row():
+                    with gr.Column():
+                        # Add description of the training buttons
+                        self.components["training_buttons_info"] = gr.Markdown("""
+                        ## Training Options
+                        - **Start new training**: Begins training from scratch (clears previous checkpoints)
+                        - **Start from latest checkpoint**: Continues training from the most recent checkpoint
+                        """)
+                        with gr.Row():
+                            # Check for existing checkpoints to determine button text
+                            checkpoints = list(OUTPUT_PATH.glob("finetrainers_step_*"))
+                            has_checkpoints = len(checkpoints) > 0
+                            # Rename "Start Training" to "Start new training"
+                            self.components["start_btn"] = gr.Button(
+                                "Start new training",
+                                variant="primary",
+                                interactive=not ASK_USER_TO_DUPLICATE_SPACE
+                            )
+                            # Add new button for continuing from checkpoint
+                            self.components["resume_btn"] = gr.Button(
+                                "Start from latest checkpoint",
+                                variant="primary",
+                                interactive=has_checkpoints and not ASK_USER_TO_DUPLICATE_SPACE
+                            )
+                        with gr.Row():
+                            # Just use stop and pause buttons for now to ensure compatibility
+                            self.components["stop_btn"] = gr.Button(
+                                "Stop at Last Checkpoint",
+                                variant="primary",
+                                interactive=False
+                            )
+                            self.components["pause_resume_btn"] = gr.Button(
+                                "Resume Training",
+                                variant="secondary",
+                                interactive=False,
+                                visible=False
+                            )
+                            # Add delete checkpoints button
+                            self.components["delete_checkpoints_btn"] = gr.Button(
+                                "Delete All Checkpoints",
+                                variant="stop",
+                                interactive=has_checkpoints
+                            )
+                with gr.Column():
                     with gr.Row():
                         with gr.Column():
                             self.components["status_box"] = gr.Textbox(
                                 elem_id="current_task_display"
                             )
+                            with gr.Accordion("Finetrainers output (or see app logs for more details)"):
                                 self.components["log_box"] = gr.TextArea(
+                                    #label="",
                                     interactive=False,
+                                    lines=60,
+                                    max_lines=600,
                                     autoscroll=True
                                 )
         self.app.update_ui_state(model_type=model_type, model_version=model_version)
         return None
+    def handle_new_training_start(
+        self, preset, model_type, model_version, training_type,
+        lora_rank, lora_alpha, train_steps, batch_size, learning_rate,
+        save_iterations, repo_id, progress=gr.Progress()
+    ):
+        """Handle new training start with checkpoint cleanup"""
+        # Clear output directory to start fresh
+        # Delete all checkpoint directories
+        for checkpoint in OUTPUT_PATH.glob("finetrainers_step_*"):
+            if checkpoint.is_dir():
+                shutil.rmtree(checkpoint)
+        # Also delete session.json which contains previous training info
+        session_file = OUTPUT_PATH / "session.json"
+        if session_file.exists():
+            session_file.unlink()
+        self.append_log("Cleared previous checkpoints for new training session")
+        # Start training normally
+        return self.handle_training_start(
+            preset, model_type, model_version, training_type,
+            lora_rank, lora_alpha, train_steps, batch_size, learning_rate,
+            save_iterations, repo_id, progress
+        )
+    def handle_resume_training(
+        self, preset, model_type, model_version, training_type,
+        lora_rank, lora_alpha, train_steps, batch_size, learning_rate,
+        save_iterations, repo_id, progress=gr.Progress()
+    ):
+        """Handle resuming training from the latest checkpoint"""
+        # Find the latest checkpoint
+        checkpoints = list(OUTPUT_PATH.glob("finetrainers_step_*"))
+        if not checkpoints:
+            return "No checkpoints found to resume from", "Please start a new training session instead"
+        self.append_log(f"Resuming training from latest checkpoint")
+        # Start training with the checkpoint
+        return self.handle_training_start(
+            preset, model_type, model_version, training_type,
+            lora_rank, lora_alpha, train_steps, batch_size, learning_rate,
+            save_iterations, repo_id, progress,
+            resume_from_checkpoint="latest"
+        )
     def connect_events(self) -> None:
         """Connect event handlers to UI components"""
         # Model type change event - Update model version dropdown choices
         # Training control events
         self.components["start_btn"].click(
+            fn=self.handle_new_training_start,
             inputs=[
                 self.components["training_preset"],
                 self.components["model_type"],
+                self.components["model_version"],
                 self.components["training_type"],
                 self.components["lora_rank"],
                 self.components["lora_alpha"],
             ]
         )
+        self.components["resume_btn"].click(
+            fn=self.handle_resume_training,
+            inputs=[
+                self.components["training_preset"],
+                self.components["model_type"],
+                self.components["model_version"],
+                self.components["training_type"],
+                self.components["lora_rank"],
+                self.components["lora_alpha"],
+                self.components["train_steps"],
+                self.components["batch_size"],
+                self.components["learning_rate"],
+                self.components["save_iterations"],
+                self.app.tabs["manage_tab"].components["repo_id"]
+            ],
+            outputs=[
+                self.components["status_box"],
+                self.components["log_box"]
+            ]
+        )
         # Use simplified event handlers for pause/resume and stop
         third_btn = self.components["delete_checkpoints_btn"] if "delete_checkpoints_btn" in self.components else self.components["pause_resume_btn"]
             self.app.log_parser = TrainingLogParser()
         # Check for latest checkpoint
+        checkpoints = list(OUTPUT_PATH.glob("finetrainers_step_*"))
+        has_checkpoints = len(checkpoints) > 0
         resume_from = None
         if checkpoints:
         status, _, _ = self.get_latest_status_message_and_logs()
         # Add checkpoints detection
+        checkpoints = list(OUTPUT_PATH.glob("finetrainers_step_*"))
+        has_checkpoints = len(checkpoints) > 0
         is_training = status in ["training", "initializing"]
         is_completed = status in ["completed", "error", "stopped"]
         # Create button updates
         start_btn = gr.Button(
+            value="Start new training",
             interactive=not is_training,
             variant="primary" if not is_training else "secondary"
         )
+        resume_btn = gr.Button(
+            value="Start from latest checkpoint",
+            interactive=has_checkpoints and not is_training,
+            variant="primary" if not is_training else "secondary"
+        )
         stop_btn = gr.Button(
             value="Stop at Last Checkpoint",
             interactive=is_training,
             variant="primary" if is_training else "secondary"
         )
+        # Add delete_checkpoints_btn
+        delete_checkpoints_btn = gr.Button(
+            "Delete All Checkpoints",
+            interactive=has_checkpoints and not is_training,
+            variant="stop"
+        )
+        return start_btn, resume_btn, stop_btn, delete_checkpoints_btn
     def update_training_ui(self, training_state: Dict[str, Any]):
         """Update UI components based on training state"""
         updates = {}