Spaces:

jbilcke-hf
/

VideoModelStudio

Running

App Files Community

jbilcke-hf HF Staff committed on Mar 6

Commit

38cfbff

1 Parent(s): 29d6f3c

working to improve log reporting

Browse files

Files changed (4) hide show

vms/services/trainer.py +0 -1
vms/tabs/train_tab.py +36 -8
vms/ui/video_trainer_ui.py +15 -4
vms/utils/training_log_parser.py +133 -14

vms/services/trainer.py CHANGED Viewed

@@ -834,7 +834,6 @@ class TrainingService:
             params = last_session.get('params', {})
             # Map internal model type back to display name for UI
-            # This is the key fix for the "ltx_video" vs "LTX-Video (LoRA)" mismatch
             model_type_internal = params.get('model_type')
             model_type_display = model_type_internal

             params = last_session.get('params', {})
             # Map internal model type back to display name for UI
             model_type_internal = params.get('model_type')
             model_type_display = model_type_internal

vms/tabs/train_tab.py CHANGED Viewed

@@ -1,5 +1,5 @@
 """
-Train tab for Video Model Studio UI
 """
 import gradio as gr
@@ -126,7 +126,7 @@ class TrainTab(BaseTab):
                             visible=False
                         )
-                        # Add delete checkpoints button - THIS IS THE KEY FIX
                         self.components["delete_checkpoints_btn"] = gr.Button(
                             "Delete All Checkpoints",
                             variant="stop",
@@ -140,6 +140,15 @@ class TrainTab(BaseTab):
                                 interactive=False,
                                 lines=4
                             )
                             with gr.Accordion("See training logs"):
                                 self.components["log_box"] = gr.TextArea(
                                     label="Finetrainers output (see HF Space logs for more details)",
@@ -288,7 +297,8 @@ class TrainTab(BaseTab):
                 self.components["log_box"],
                 self.components["start_btn"],
                 self.components["stop_btn"],
-                self.components["pause_resume_btn"]
             ]
         )
@@ -299,7 +309,8 @@ class TrainTab(BaseTab):
                 self.components["log_box"],
                 self.components["start_btn"],
                 self.components["stop_btn"],
-                self.components["pause_resume_btn"]
             ]
         )
@@ -310,7 +321,8 @@ class TrainTab(BaseTab):
                 self.components["log_box"],
                 self.components["start_btn"],
                 self.components["stop_btn"],
-                self.components["pause_resume_btn"]
             ]
         )
@@ -325,7 +337,8 @@ class TrainTab(BaseTab):
                 self.components["log_box"],
                 self.components["start_btn"],
                 self.components["stop_btn"],
-                self.components["delete_checkpoints_btn"]
             ]
         )
@@ -555,6 +568,12 @@ class TrainTab(BaseTab):
         updates["status_box"] = "\n".join(status_text)
         # Update button states
         updates["start_btn"] = gr.Button(
             "Start training",
@@ -638,6 +657,10 @@ class TrainTab(BaseTab):
         elif "stopped" in state["message"].lower():
             state["status"] = "stopped"
         return (state["status"], state["message"], logs)
     def get_latest_status_message_logs_and_button_labels(self) -> Tuple:
@@ -649,8 +672,13 @@ class TrainTab(BaseTab):
         button_updates = self.update_training_buttons(status, has_checkpoints).values()
-        # Return in order expected by timer
-        return (message, logs, *button_updates)
     def update_training_buttons(self, status: str, has_checkpoints: bool = None) -> Dict:
         """Update training control buttons based on state"""

 """
+Train tab for Video Model Studio UI with improved task progress display
 """
 import gradio as gr
                             visible=False
                         )
+                        # Add delete checkpoints button
                         self.components["delete_checkpoints_btn"] = gr.Button(
                             "Delete All Checkpoints",
                             variant="stop",
                                 interactive=False,
                                 lines=4
                             )
+                            # Add new component for current task progress
+                            self.components["current_task_box"] = gr.Textbox(
+                                label="Current Task Progress",
+                                interactive=False,
+                                lines=3,
+                                elem_id="current_task_display"
+                            )
                             with gr.Accordion("See training logs"):
                                 self.components["log_box"] = gr.TextArea(
                                     label="Finetrainers output (see HF Space logs for more details)",
                 self.components["log_box"],
                 self.components["start_btn"],
                 self.components["stop_btn"],
+                self.components["pause_resume_btn"],
+                self.components["current_task_box"]  # Include new component
             ]
         )
                 self.components["log_box"],
                 self.components["start_btn"],
                 self.components["stop_btn"],
+                self.components["pause_resume_btn"],
+                self.components["current_task_box"]  # Include new component
             ]
         )
                 self.components["log_box"],
                 self.components["start_btn"],
                 self.components["stop_btn"],
+                self.components["pause_resume_btn"],
+                self.components["current_task_box"]  # Include new component
             ]
         )
                 self.components["log_box"],
                 self.components["start_btn"],
                 self.components["stop_btn"],
+                self.components["delete_checkpoints_btn"],
+                self.components["current_task_box"]  # Include new component
             ]
         )
         updates["status_box"] = "\n".join(status_text)
+        # Add current task information to the dedicated box
+        if training_state.get("current_task"):
+            updates["current_task_box"] = training_state["current_task"]
+        else:
+            updates["current_task_box"] = "No active task" if training_state["status"] != "training" else "Waiting for task information..."
         # Update button states
         updates["start_btn"] = gr.Button(
             "Start training",
         elif "stopped" in state["message"].lower():
             state["status"] = "stopped"
+        # Add the current task info if available
+        if hasattr(self.app, 'log_parser') and self.app.log_parser is not None:
+            state["current_task"] = self.app.log_parser.get_current_task_display()
         return (state["status"], state["message"], logs)
     def get_latest_status_message_logs_and_button_labels(self) -> Tuple:
         button_updates = self.update_training_buttons(status, has_checkpoints).values()
+        # Get current task if available
+        current_task = ""
+        if hasattr(self.app, 'log_parser') and self.app.log_parser is not None:
+            current_task = self.app.log_parser.get_current_task_display()
+        # Return in order expected by timer (added current_task)
+        return (message, logs, *button_updates, current_task)
     def update_training_buttons(self, status: str, has_checkpoints: bool = None) -> Dict:
         """Update training control buttons based on state"""

vms/ui/video_trainer_ui.py CHANGED Viewed

@@ -89,13 +89,14 @@ class VideoTrainerUI:
                     self.tabs["train_tab"].components["pause_resume_btn"],
                     self.tabs["train_tab"].components["training_preset"],
                     self.tabs["train_tab"].components["model_type"],
-                    self.tabs["train_tab"].components["training_type"],  # Add the new training_type component to outputs
                     self.tabs["train_tab"].components["lora_rank"],
                     self.tabs["train_tab"].components["lora_alpha"],
                     self.tabs["train_tab"].components["num_epochs"],
                     self.tabs["train_tab"].components["batch_size"],
                     self.tabs["train_tab"].components["learning_rate"],
-                    self.tabs["train_tab"].components["save_iterations"]
                 ]
             )
@@ -114,6 +115,10 @@ class VideoTrainerUI:
             self.tabs["train_tab"].components["stop_btn"]
         ]
         # Add delete_checkpoints_btn only if it exists
         if "delete_checkpoints_btn" in self.tabs["train_tab"].components:
             outputs.append(self.tabs["train_tab"].components["delete_checkpoints_btn"])
@@ -237,6 +242,11 @@ class VideoTrainerUI:
         learning_rate_val = float(ui_state.get("learning_rate", 3e-5))
         save_iterations_val = int(ui_state.get("save_iterations", 500))
         # Return all values in the exact order expected by outputs
         return (
             video_list,
@@ -252,7 +262,8 @@ class VideoTrainerUI:
             num_epochs_val,
             batch_size_val,
             learning_rate_val,
-            save_iterations_val
         )
     def initialize_ui_from_state(self):
@@ -293,7 +304,7 @@ class VideoTrainerUI:
         ui_state["save_iterations"] = int(ui_state.get("save_iterations", 500))
         return ui_state
     # Add this new method to get initial button states:
     def get_initial_button_states(self):
         """Get the initial states for training buttons based on recovery status"""

                     self.tabs["train_tab"].components["pause_resume_btn"],
                     self.tabs["train_tab"].components["training_preset"],
                     self.tabs["train_tab"].components["model_type"],
+                    self.tabs["train_tab"].components["training_type"],
                     self.tabs["train_tab"].components["lora_rank"],
                     self.tabs["train_tab"].components["lora_alpha"],
                     self.tabs["train_tab"].components["num_epochs"],
                     self.tabs["train_tab"].components["batch_size"],
                     self.tabs["train_tab"].components["learning_rate"],
+                    self.tabs["train_tab"].components["save_iterations"],
+                    self.tabs["train_tab"].components["current_task_box"]  # Add new component
                 ]
             )
             self.tabs["train_tab"].components["stop_btn"]
         ]
+        # Add current_task_box component
+        if "current_task_box" in self.tabs["train_tab"].components:
+            outputs.append(self.tabs["train_tab"].components["current_task_box"])
         # Add delete_checkpoints_btn only if it exists
         if "delete_checkpoints_btn" in self.tabs["train_tab"].components:
             outputs.append(self.tabs["train_tab"].components["delete_checkpoints_btn"])
         learning_rate_val = float(ui_state.get("learning_rate", 3e-5))
         save_iterations_val = int(ui_state.get("save_iterations", 500))
+        # Initial current task value
+        current_task_val = ""
+        if hasattr(self, 'log_parser') and self.log_parser:
+            current_task_val = self.log_parser.get_current_task_display()
         # Return all values in the exact order expected by outputs
         return (
             video_list,
             num_epochs_val,
             batch_size_val,
             learning_rate_val,
+            save_iterations_val,
+            current_task_val  # Add current task value
         )
     def initialize_ui_from_state(self):
         ui_state["save_iterations"] = int(ui_state.get("save_iterations", 500))
         return ui_state
     # Add this new method to get initial button states:
     def get_initial_button_states(self):
         """Get the initial states for training buttons based on recovery status"""

vms/utils/training_log_parser.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import re
 import logging
 from dataclasses import dataclass
-from typing import Optional, Dict, Any
 from datetime import datetime, timedelta
 logger = logging.getLogger(__name__)
@@ -25,6 +25,22 @@ class TrainingState:
     error_message: Optional[str] = None
     initialization_stage: str = ""
     download_progress: float = 0.0
     def calculate_progress(self) -> float:
         """Calculate overall progress as percentage"""
@@ -44,7 +60,7 @@ class TrainingState:
         # Use precomputed remaining time from logs if available
         remaining = str(self.estimated_remaining) if self.estimated_remaining else "calculating..."
-        return {
             "status": self.status,
             "progress": f"{self.calculate_progress():.1f}%",
             "current_step": self.current_step,
@@ -61,6 +77,96 @@ class TrainingState:
             "error_message": self.error_message,
             "download_progress": self.download_progress
         }
 class TrainingLogParser:
     """Parser for training logs with state management"""
@@ -68,12 +174,30 @@ class TrainingLogParser:
     def __init__(self):
         self.state = TrainingState()
         self._last_update_time = None
     def parse_line(self, line: str) -> Optional[Dict[str, Any]]:
         """Parse a single log line and update state"""
         try:
-            # For debugging
-            #logger.info(f"Parsing line: {line[:100]}...")
             # Training step progress line example:
             # Training steps:   1%|▏         | 1/70 [00:14<16:11, 14.08s/it, grad_norm=0.00789, step_loss=0.555, lr=3e-7]
@@ -157,16 +281,16 @@ class TrainingLogParser:
             # Completion states
             if "Training completed successfully" in line:
-                self.status = "completed"
                 # Store final elapsed time
-                self.last_step_time = datetime.now()
                 logger.info("Training completed")
                 return self.state.to_dict()
             if any(x in line for x in ["Training process stopped", "Training stopped"]):
-                self.status = "stopped"
                 # Store final elapsed time
-                self.last_step_time = datetime.now()
                 logger.info("Training stopped")
                 return self.state.to_dict()
@@ -179,9 +303,4 @@ class TrainingLogParser:
         except Exception as e:
             logger.error(f"Error parsing line: {str(e)}")
-        return None
-    def reset(self):
-        """Reset parser state"""
-        self.state = TrainingState()
-        self._last_update_time = None

 import re
 import logging
 from dataclasses import dataclass
+from typing import Optional, Dict, Any, List
 from datetime import datetime, timedelta
 logger = logging.getLogger(__name__)
     error_message: Optional[str] = None
     initialization_stage: str = ""
     download_progress: float = 0.0
+    # New fields for current task tracking
+    current_task: str = ""
+    current_task_progress: str = ""
+    task_progress_percentage: float = 0.0
+    task_items_processed: int = 0
+    task_total_items: int = 0
+    task_time_remaining: str = ""
+    task_speed: str = ""
+    # Store recent progress lines for task display
+    recent_progress_lines: List[str] = None
+    def __post_init__(self):
+        if self.recent_progress_lines is None:
+            self.recent_progress_lines = []
     def calculate_progress(self) -> float:
         """Calculate overall progress as percentage"""
         # Use precomputed remaining time from logs if available
         remaining = str(self.estimated_remaining) if self.estimated_remaining else "calculating..."
+        result = {
             "status": self.status,
             "progress": f"{self.calculate_progress():.1f}%",
             "current_step": self.current_step,
             "error_message": self.error_message,
             "download_progress": self.download_progress
         }
+        # Add current task information
+        result["current_task"] = self.get_task_display()
+        return result
+    def get_task_display(self) -> str:
+        """Generate a formatted display of the current task"""
+        if not self.recent_progress_lines:
+            if self.status == "training":
+                return "Training in progress..."
+            return ""
+        # Get the most recent progress line
+        latest_line = self.recent_progress_lines[-1]
+        # For downloading shards or loading checkpoint shards
+        if "Downloading shards" in latest_line or "Loading checkpoint shards" in latest_line:
+            # Extract just the progress bar part
+            match = re.search(r'(\d+%\|[▏▎▍▌▋▊▉█\s]+\|)', latest_line)
+            if match:
+                progress_bar = match.group(1)
+                # Extract the remaining information
+                time_match = re.search(r'\[(\d+:\d+<\d+:\d+,\s+[\d.]+s/it)', latest_line)
+                time_info = time_match.group(1) if time_match else ""
+                task_type = "Downloading shards" if "Downloading shards" in latest_line else "Loading checkpoint shards"
+                return f"{task_type}:\n{progress_bar}\n{time_info}"
+        # For "Rank 0" progress (typically training steps)
+        elif "Rank 0:" in latest_line:
+            match = re.search(r'Rank 0:\s+(\d+%\|[▏▎▍▌▋▊▉█\s]+\|)', latest_line)
+            if match:
+                progress_bar = match.group(1)
+                # Extract step information
+                step_match = re.search(r'\|\s+(\d+/\d+)', latest_line)
+                step_info = step_match.group(1) if step_match else ""
+                # Extract time information
+                time_match = re.search(r'\[(\d+:\d+<\d+:\d+,\s+[\d.]+s/it)', latest_line)
+                time_info = time_match.group(1) if time_match else ""
+                return f"Training iteration:\n{progress_bar} {step_info}\n{time_info}"
+        # For Filling buffer progress
+        elif "Filling buffer" in latest_line:
+            match = re.search(r'(\d+%\|[▏▎▍▌▋▊▉█\s]+\|)', latest_line)
+            if match:
+                progress_bar = match.group(1)
+                # Extract step information
+                step_match = re.search(r'\|\s+(\d+/\d+)', latest_line)
+                step_info = step_match.group(1) if step_match else ""
+                # Extract time information
+                time_match = re.search(r'\[(\d+:\d+<\d+:\d+,\s+[\d.]+s/it)', latest_line)
+                time_info = time_match.group(1) if time_match else ""
+                return f"Filling buffer from data iterator:\n{progress_bar} {step_info}\n{time_info}"
+        # For other progress lines
+        elif "%" in latest_line and "|" in latest_line:
+            # Generic progress bar pattern
+            match = re.search(r'(\d+%\|[▏▎▍▌▋▊▉█\s]+\|)', latest_line)
+            if match:
+                progress_bar = match.group(1)
+                # Try to extract step information
+                step_match = re.search(r'\|\s+(\d+/\d+)', latest_line)
+                step_info = step_match.group(1) if step_match else ""
+                # Try to extract time information
+                time_match = re.search(r'\[(\d+:\d+<\d+:\d+,\s+[\d.]+s/it)', latest_line)
+                time_info = time_match.group(1) if time_match else ""
+                task_prefix = "Processing:"
+                # Try to determine task type
+                if "Training" in latest_line:
+                    task_prefix = "Training:"
+                elif "Precomputing" in latest_line:
+                    task_prefix = "Precomputing:"
+                return f"{task_prefix}\n{progress_bar} {step_info}\n{time_info}"
+        # If we couldn't parse it properly, just return the line
+        return latest_line.strip()
 class TrainingLogParser:
     """Parser for training logs with state management"""
     def __init__(self):
         self.state = TrainingState()
         self._last_update_time = None
+        # Maximum number of recent progress lines to store
+        self.max_recent_lines = 5
+    def reset(self):
+        """Reset parser state"""
+        self.state = TrainingState()
+        self._last_update_time = None
+    def get_current_task_display(self) -> str:
+        """Get the formatted current task display"""
+        return self.state.get_task_display()
     def parse_line(self, line: str) -> Optional[Dict[str, Any]]:
         """Parse a single log line and update state"""
         try:
+            # Check if this is a progress line
+            if any(pattern in line for pattern in ["Downloading shards:", "Loading checkpoint shards:", "Rank 0:", "Filling buffer", "|"]) and "%" in line:
+                # Add to recent progress lines, maintaining order and max length
+                self.state.recent_progress_lines.append(line)
+                if len(self.state.recent_progress_lines) > self.max_recent_lines:
+                    self.state.recent_progress_lines.pop(0)
+                # Return updated state
+                return self.state.to_dict()
             # Training step progress line example:
             # Training steps:   1%|▏         | 1/70 [00:14<16:11, 14.08s/it, grad_norm=0.00789, step_loss=0.555, lr=3e-7]
             # Completion states
             if "Training completed successfully" in line:
+                self.state.status = "completed"
                 # Store final elapsed time
+                self.state.last_step_time = datetime.now()
                 logger.info("Training completed")
                 return self.state.to_dict()
             if any(x in line for x in ["Training process stopped", "Training stopped"]):
+                self.state.status = "stopped"
                 # Store final elapsed time
+                self.state.last_step_time = datetime.now()
                 logger.info("Training stopped")
                 return self.state.to_dict()
         except Exception as e:
             logger.error(f"Error parsing line: {str(e)}")
+        return None