jbilcke-hf committed
Commit 0d34ea8 · 1 Parent(s): c5911ab

add gpu tracking

requirements.txt CHANGED
@@ -2,6 +2,7 @@ numpy>=1.26.4
 
 # to quote a-r-r-o-w/finetrainers:
 # It is recommended to use Pytorch 2.5.1 or above for training. Previous versions can lead to completely black videos, OOM errors, or other issues and are not tested.
+
 # on some system (Python 3.13+) those do not work:
 torch==2.5.1
 torchvision==0.20.1
@@ -20,6 +21,9 @@ accelerate
 bitsandbytes
 peft>=0.12.0
 
+# For GPU monitoring of NVIDIA chipsets
+pynvml
+
 # eva-decord is missing get_batch it seems
 #eva-decord==0.6.1
 decord
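
pynvml is the only new dependency; a quick way to check it is usable on the target machine is the minimal sketch below (not part of this commit, and it assumes an NVIDIA driver with at least one visible GPU):

# Minimal pynvml sanity check (illustrative only)
import pynvml

pynvml.nvmlInit()
count = pynvml.nvmlDeviceGetCount()
for i in range(count):
    handle = pynvml.nvmlDeviceGetHandleByIndex(i)
    name = pynvml.nvmlDeviceGetName(handle)
    if isinstance(name, bytes):
        name = name.decode("utf-8")
    util = pynvml.nvmlDeviceGetUtilizationRates(handle)
    mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU {i}: {name}, util {util.gpu}%, "
          f"memory {mem.used / 1024**3:.2f}/{mem.total / 1024**3:.2f} GB")
pynvml.nvmlShutdown()
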
requirements_without_flash_attention.txt CHANGED
@@ -21,6 +21,10 @@ accelerate
 bitsandbytes
 peft>=0.12.0
 
+# For GPU monitoring of NVIDIA chipsets
+# you probably won't be able to install that on macOS
+# pynvml
+
 # eva-decord is missing get_batch it seems
 eva-decord==0.6.1
 # decord
vms/config.py CHANGED
@@ -150,6 +150,9 @@ DEFAULT_NB_TRAINING_STEPS = 1000
 # For this value, it is recommended to use about 20 to 40% of the number of training steps
 DEFAULT_NB_LR_WARMUP_STEPS = math.ceil(0.20 * DEFAULT_NB_TRAINING_STEPS) # 20% of training steps
 
+# Whether to automatically restart a training job after a server reboot or not
+DEFAULT_AUTO_RESUME = False
+
 # For validation
 DEFAULT_VALIDATION_NB_STEPS = 50
 DEFAULT_VALIDATION_HEIGHT = 512
vms/ui/app_ui.py CHANGED
@@ -19,7 +19,8 @@ from vms.config import (
     DEFAULT_MAX_GPUS,
     DEFAULT_PRECOMPUTATION_ITEMS,
     DEFAULT_NB_TRAINING_STEPS,
-    DEFAULT_NB_LR_WARMUP_STEPS
+    DEFAULT_NB_LR_WARMUP_STEPS,
+    DEFAULT_AUTO_RESUME
 )
 from vms.utils import (
     get_recommended_precomputation_items,
@@ -40,7 +41,7 @@ from vms.ui.monitoring.services import (
 )
 
 from vms.ui.monitoring.tabs import (
-    GeneralTab
+    GeneralTab, GPUTab
 )
 
 logger = logging.getLogger(__name__)
@@ -183,6 +184,8 @@ class AppUI:
                 # Initialize monitoring tab objects
                 self.monitor_tabs["general_tab"] = GeneralTab(self)
 
+                self.monitor_tabs["gpu_tab"] = GPUTab(self)
+
                 # Create tab UI components for monitoring
                 for tab_id, tab_obj in self.monitor_tabs.items():
                     tab_obj.create(monitoring_tabs)
@@ -230,7 +233,8 @@ class AppUI:
                 self.project_tabs["train_tab"].components["current_task_box"],
                 self.project_tabs["train_tab"].components["num_gpus"],
                 self.project_tabs["train_tab"].components["precomputation_items"],
-                self.project_tabs["train_tab"].components["lr_warmup_steps"]
+                self.project_tabs["train_tab"].components["lr_warmup_steps"],
+                self.project_tabs["train_tab"].components["auto_resume_checkbox"]
             ]
         )
 
@@ -376,6 +380,8 @@ class AppUI:
         # Get model_version value
         model_version_val = ""
 
+        auto_resume_val = ui_state.get("auto_resume", DEFAULT_AUTO_RESUME)
+
         # First get the internal model type for the currently selected model
         model_internal_type = MODEL_TYPES.get(model_type_val)
         logger.info(f"Initializing model version for model_type: {model_type_val} (internal: {model_internal_type})")
@@ -480,7 +486,8 @@ class AppUI:
             current_task_val,
             num_gpus_val,
             precomputation_items_val,
-            lr_warmup_steps_val
+            lr_warmup_steps_val,
+            auto_resume_val
         )
 
     def initialize_ui_from_state(self):
vms/ui/monitoring/services/gpu.py ADDED
@@ -0,0 +1,485 @@

"""
GPU monitoring service for Video Model Studio.
Tracks NVIDIA GPU resources like utilization, memory, and temperature.
"""

import os
import time
import logging
from typing import Dict, List, Any, Optional, Tuple
from collections import deque
from datetime import datetime

# Force the use of the Agg backend which is thread-safe
import matplotlib
matplotlib.use('Agg')  # Must be before importing pyplot
import matplotlib.pyplot as plt
import numpy as np

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Optional import of pynvml
try:
    import pynvml
    PYNVML_AVAILABLE = True
except ImportError:
    PYNVML_AVAILABLE = False
    logger.info("pynvml not available, GPU monitoring will be limited")

class GPUMonitoringService:
    """Service for monitoring NVIDIA GPU resources"""

    def __init__(self, history_minutes: int = 10, sample_interval: int = 5):
        """Initialize the GPU monitoring service

        Args:
            history_minutes: How many minutes of history to keep
            sample_interval: How many seconds between samples
        """
        self.history_minutes = history_minutes
        self.sample_interval = sample_interval
        self.max_samples = (history_minutes * 60) // sample_interval

        # Track if the monitoring thread is running
        self.is_running = False
        self.thread = None

        # Check if NVIDIA GPUs are available
        self.has_nvidia_gpus = False
        self.gpu_count = 0
        self.device_info = []
        self.history = {}

        # Try to initialize NVML
        self._initialize_nvml()

        # Initialize history data structures if GPUs are available
        if self.has_nvidia_gpus:
            self._initialize_history()

    def _initialize_nvml(self):
        """Initialize NVIDIA Management Library"""
        if not PYNVML_AVAILABLE:
            logger.info("pynvml module not installed, GPU monitoring disabled")
            return

        try:
            pynvml.nvmlInit()
            self.gpu_count = pynvml.nvmlDeviceGetCount()
            self.has_nvidia_gpus = self.gpu_count > 0

            if self.has_nvidia_gpus:
                logger.info(f"Successfully initialized NVML, found {self.gpu_count} GPU(s)")
                # Get static information about each GPU
                for i in range(self.gpu_count):
                    self.device_info.append(self._get_device_info(i))
            else:
                logger.info("No NVIDIA GPUs found")

        except Exception as e:
            logger.warning(f"Failed to initialize NVML: {str(e)}")
            self.has_nvidia_gpus = False

    def _initialize_history(self):
        """Initialize data structures for storing metric history"""
        for i in range(self.gpu_count):
            self.history[i] = {
                'timestamps': deque(maxlen=self.max_samples),
                'utilization': deque(maxlen=self.max_samples),
                'memory_used': deque(maxlen=self.max_samples),
                'memory_total': deque(maxlen=self.max_samples),
                'memory_percent': deque(maxlen=self.max_samples),
                'temperature': deque(maxlen=self.max_samples),
                'power_usage': deque(maxlen=self.max_samples),
                'power_limit': deque(maxlen=self.max_samples),
            }

    def _get_device_info(self, device_index: int) -> Dict[str, Any]:
        """Get static information about a GPU device

        Args:
            device_index: Index of the GPU device

        Returns:
            Dictionary with device information
        """
        if not PYNVML_AVAILABLE or not self.has_nvidia_gpus:
            return {"error": "NVIDIA GPUs not available"}

        try:
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)

            # Get device name (decode if it's bytes)
            name = pynvml.nvmlDeviceGetName(handle)
            if isinstance(name, bytes):
                name = name.decode('utf-8')

            # Get device UUID
            uuid = pynvml.nvmlDeviceGetUUID(handle)
            if isinstance(uuid, bytes):
                uuid = uuid.decode('utf-8')

            # Get memory info, compute capability
            memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            compute_capability = pynvml.nvmlDeviceGetCudaComputeCapability(handle)

            # Get power limits if available
            try:
                power_limit = pynvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000.0  # Convert to watts
            except pynvml.NVMLError:
                power_limit = None

            return {
                'index': device_index,
                'name': name,
                'uuid': uuid,
                'memory_total': memory_info.total,
                'memory_total_gb': memory_info.total / (1024**3),  # Convert to GB
                'compute_capability': f"{compute_capability[0]}.{compute_capability[1]}",
                'power_limit': power_limit
            }

        except Exception as e:
            logger.error(f"Error getting device info for GPU {device_index}: {str(e)}")
            return {"error": str(e), "index": device_index}

    def collect_gpu_metrics(self) -> List[Dict[str, Any]]:
        """Collect current GPU metrics for all available GPUs

        Returns:
            List of dictionaries with current metrics for each GPU
        """
        if not PYNVML_AVAILABLE or not self.has_nvidia_gpus:
            return []

        metrics = []
        timestamp = datetime.now()

        for i in range(self.gpu_count):
            try:
                handle = pynvml.nvmlDeviceGetHandleByIndex(i)

                # Get utilization rates
                utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)

                # Get memory information
                memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)

                # Get temperature
                temperature = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)

                # Get power usage if available
                try:
                    power_usage = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0  # Convert to watts
                except pynvml.NVMLError:
                    power_usage = None

                # Get process information
                processes = []
                try:
                    for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
                        try:
                            process_name = pynvml.nvmlSystemGetProcessName(proc.pid)
                            if isinstance(process_name, bytes):
                                process_name = process_name.decode('utf-8')
                        except pynvml.NVMLError:
                            process_name = f"Unknown (PID: {proc.pid})"

                        processes.append({
                            'pid': proc.pid,
                            'name': process_name,
                            'memory_used': proc.usedGpuMemory,
                            'memory_used_mb': proc.usedGpuMemory / (1024**2)  # Convert to MB
                        })
                except pynvml.NVMLError:
                    # Unable to get process information, continue with empty list
                    pass

                gpu_metrics = {
                    'index': i,
                    'timestamp': timestamp,
                    'utilization_gpu': utilization.gpu,
                    'utilization_memory': utilization.memory,
                    'memory_total': memory_info.total,
                    'memory_used': memory_info.used,
                    'memory_free': memory_info.free,
                    'memory_percent': (memory_info.used / memory_info.total) * 100,
                    'temperature': temperature,
                    'power_usage': power_usage,
                    'processes': processes
                }

                metrics.append(gpu_metrics)

            except Exception as e:
                logger.error(f"Error collecting metrics for GPU {i}: {str(e)}")
                metrics.append({
                    'index': i,
                    'error': str(e)
                })

        return metrics

    def update_history(self):
        """Update GPU metrics history"""
        if not self.has_nvidia_gpus:
            return

        current_metrics = self.collect_gpu_metrics()
        timestamp = datetime.now()

        for gpu_metrics in current_metrics:
            if 'error' in gpu_metrics:
                continue

            idx = gpu_metrics['index']

            self.history[idx]['timestamps'].append(timestamp)
            self.history[idx]['utilization'].append(gpu_metrics['utilization_gpu'])
            self.history[idx]['memory_used'].append(gpu_metrics['memory_used'])
            self.history[idx]['memory_total'].append(gpu_metrics['memory_total'])
            self.history[idx]['memory_percent'].append(gpu_metrics['memory_percent'])
            self.history[idx]['temperature'].append(gpu_metrics['temperature'])

            if gpu_metrics['power_usage'] is not None:
                self.history[idx]['power_usage'].append(gpu_metrics['power_usage'])
            else:
                self.history[idx]['power_usage'].append(0)

            # Store power limit in history (static but kept for consistency)
            info = self.device_info[idx]
            if 'power_limit' in info and info['power_limit'] is not None:
                self.history[idx]['power_limit'].append(info['power_limit'])
            else:
                self.history[idx]['power_limit'].append(0)

    def start_monitoring(self):
        """Start background thread for collecting GPU metrics"""
        if self.is_running:
            logger.warning("GPU monitoring thread already running")
            return

        if not self.has_nvidia_gpus:
            logger.info("No NVIDIA GPUs found, not starting monitoring thread")
            return

        import threading

        self.is_running = True

        def _monitor_loop():
            while self.is_running:
                try:
                    self.update_history()
                    time.sleep(self.sample_interval)
                except Exception as e:
                    logger.error(f"Error in GPU monitoring thread: {str(e)}", exc_info=True)
                    time.sleep(self.sample_interval)

        self.thread = threading.Thread(target=_monitor_loop, daemon=True)
        self.thread.start()
        logger.info("GPU monitoring thread started")

    def stop_monitoring(self):
        """Stop the GPU monitoring thread"""
        if not self.is_running:
            return

        self.is_running = False
        if self.thread:
            self.thread.join(timeout=1.0)
        logger.info("GPU monitoring thread stopped")

    def get_gpu_info(self) -> List[Dict[str, Any]]:
        """Get information about all available GPUs

        Returns:
            List of dictionaries with GPU information
        """
        return self.device_info

    def get_current_metrics(self) -> List[Dict[str, Any]]:
        """Get current metrics for all GPUs

        Returns:
            List of dictionaries with current GPU metrics
        """
        return self.collect_gpu_metrics()

    def generate_utilization_plot(self, gpu_index: int) -> plt.Figure:
        """Generate a plot of GPU utilization over time

        Args:
            gpu_index: Index of the GPU to plot

        Returns:
            Matplotlib figure with utilization plot
        """
        plt.close('all')  # Close all existing figures
        fig, ax = plt.subplots(figsize=(10, 5))

        if not self.has_nvidia_gpus or gpu_index not in self.history:
            ax.set_title(f"No data available for GPU {gpu_index}")
            return fig

        history = self.history[gpu_index]
        if not history['timestamps']:
            ax.set_title(f"No history data for GPU {gpu_index}")
            return fig

        # Convert timestamps to strings
        x = [t.strftime('%H:%M:%S') for t in history['timestamps']]

        # If we have many points, show fewer labels for readability
        if len(x) > 10:
            step = len(x) // 10
            ax.set_xticks(range(0, len(x), step))
            ax.set_xticklabels([x[i] for i in range(0, len(x), step)], rotation=45)

        # Plot utilization
        ax.plot(x, list(history['utilization']), 'b-', label='GPU Utilization %')
        ax.set_ylim(0, 100)

        # Add temperature on secondary y-axis
        ax2 = ax.twinx()
        ax2.plot(x, list(history['temperature']), 'r-', label='Temperature °C')
        ax2.set_ylabel('Temperature (°C)', color='r')
        ax2.tick_params(axis='y', colors='r')

        # Set labels and title
        ax.set_title(f'GPU {gpu_index} Utilization Over Time')
        ax.set_xlabel('Time')
        ax.set_ylabel('Utilization %')
        ax.grid(True, alpha=0.3)

        # Add legend
        lines, labels = ax.get_legend_handles_labels()
        lines2, labels2 = ax2.get_legend_handles_labels()
        ax.legend(lines + lines2, labels + labels2, loc='upper left')

        plt.tight_layout()
        return fig

    def generate_memory_plot(self, gpu_index: int) -> plt.Figure:
        """Generate a plot of GPU memory usage over time

        Args:
            gpu_index: Index of the GPU to plot

        Returns:
            Matplotlib figure with memory usage plot
        """
        plt.close('all')  # Close all existing figures
        fig, ax = plt.subplots(figsize=(10, 5))

        if not self.has_nvidia_gpus or gpu_index not in self.history:
            ax.set_title(f"No data available for GPU {gpu_index}")
            return fig

        history = self.history[gpu_index]
        if not history['timestamps']:
            ax.set_title(f"No history data for GPU {gpu_index}")
            return fig

        # Convert timestamps to strings
        x = [t.strftime('%H:%M:%S') for t in history['timestamps']]

        # If we have many points, show fewer labels for readability
        if len(x) > 10:
            step = len(x) // 10
            ax.set_xticks(range(0, len(x), step))
            ax.set_xticklabels([x[i] for i in range(0, len(x), step)], rotation=45)

        # Plot memory percentage
        ax.plot(x, list(history['memory_percent']), 'g-', label='Memory Usage %')
        ax.set_ylim(0, 100)

        # Add absolute memory values on secondary y-axis (convert to GB)
        ax2 = ax.twinx()
        memory_used_gb = [m / (1024**3) for m in history['memory_used']]
        memory_total_gb = [m / (1024**3) for m in history['memory_total']]

        ax2.plot(x, memory_used_gb, 'm--', label='Used (GB)')
        ax2.set_ylabel('Memory (GB)')

        # Set labels and title
        ax.set_title(f'GPU {gpu_index} Memory Usage Over Time')
        ax.set_xlabel('Time')
        ax.set_ylabel('Usage %')
        ax.grid(True, alpha=0.3)

        # Add legend
        lines, labels = ax.get_legend_handles_labels()
        lines2, labels2 = ax2.get_legend_handles_labels()
        ax.legend(lines + lines2, labels + labels2, loc='upper left')

        plt.tight_layout()
        return fig

    def generate_power_plot(self, gpu_index: int) -> plt.Figure:
        """Generate a plot of GPU power usage over time

        Args:
            gpu_index: Index of the GPU to plot

        Returns:
            Matplotlib figure with power usage plot
        """
        plt.close('all')  # Close all existing figures
        fig, ax = plt.subplots(figsize=(10, 5))

        if not self.has_nvidia_gpus or gpu_index not in self.history:
            ax.set_title(f"No data available for GPU {gpu_index}")
            return fig

        history = self.history[gpu_index]
        if not history['timestamps'] or not any(history['power_usage']):
            ax.set_title(f"No power data for GPU {gpu_index}")
            return fig

        # Convert timestamps to strings
        x = [t.strftime('%H:%M:%S') for t in history['timestamps']]

        # If we have many points, show fewer labels for readability
        if len(x) > 10:
            step = len(x) // 10
            ax.set_xticks(range(0, len(x), step))
            ax.set_xticklabels([x[i] for i in range(0, len(x), step)], rotation=45)

        # Plot power usage
        power_usage = list(history['power_usage'])
        if any(power_usage):  # Only plot if we have actual power data
            ax.plot(x, power_usage, 'b-', label='Power Usage (W)')

            # Get power limit if available
            power_limit = list(history['power_limit'])
            if any(power_limit):  # Only plot if we have power limit data
                # Show power limit as horizontal line
                limit = max(power_limit)  # Should be constant, but take max just in case
                if limit > 0:
                    ax.axhline(y=limit, color='r', linestyle='--', label=f'Power Limit ({limit}W)')

            # Set labels and title
            ax.set_title(f'GPU {gpu_index} Power Usage Over Time')
            ax.set_xlabel('Time')
            ax.set_ylabel('Power (Watts)')
            ax.grid(True, alpha=0.3)
            ax.legend(loc='upper left')
        else:
            ax.set_title(f"Power data not available for GPU {gpu_index}")

        plt.tight_layout()
        return fig

    def shutdown(self):
        """Clean up resources when shutting down"""
        self.stop_monitoring()

        # Shutdown NVML if it was initialized
        if PYNVML_AVAILABLE and self.has_nvidia_gpus:
            try:
                pynvml.nvmlShutdown()
                logger.info("NVML shutdown complete")
            except Exception as e:
                logger.error(f"Error during NVML shutdown: {str(e)}")
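
For reference, the service above can also be exercised on its own, outside the Gradio UI; a minimal usage sketch (the import path and method names come from the file above, the 15-second sleep is arbitrary):

import time
from vms.ui.monitoring.services.gpu import GPUMonitoringService

svc = GPUMonitoringService(history_minutes=10, sample_interval=5)
svc.start_monitoring()   # no-op (with a log message) if no NVIDIA GPU is detected
time.sleep(15)           # let the background thread collect a few samples

for gpu in svc.get_current_metrics():
    if 'error' in gpu:
        continue
    print(f"GPU {gpu['index']}: {gpu['utilization_gpu']}% util, "
          f"{gpu['memory_percent']:.1f}% memory, {gpu['temperature']}°C")

fig = svc.generate_utilization_plot(0)  # Matplotlib figure, same one the GPU tab shows
fig.savefig("gpu0_utilization.png")
svc.shutdown()
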
vms/ui/monitoring/services/monitoring.py CHANGED
@@ -21,6 +21,8 @@ import matplotlib.pyplot as plt
 
 import numpy as np
 
+from vms.ui.monitoring.services.gpu import GPUMonitoringService
+
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 
@@ -51,6 +53,9 @@ class MonitoringService:
         # Per-core CPU history
         self.cpu_cores_percent = {}
 
+        # Initialize GPU monitoring service
+        self.gpu = GPUMonitoringService(history_minutes=history_minutes, sample_interval=sample_interval)
+
         # Track if the monitoring thread is running
         self.is_running = False
         self.thread = None
@@ -124,6 +129,9 @@ class MonitoringService:
             return
 
         self.is_running = True
+
+        # Start GPU monitoring if available
+        self.gpu.start_monitoring()
 
         def _monitor_loop():
             while self.is_running:
@@ -143,8 +151,12 @@ class MonitoringService:
         """Stop the monitoring thread"""
         if not self.is_running:
             return
-
+
         self.is_running = False
+
+        # Stop GPU monitoring
+        self.gpu.stop_monitoring()
+
         if self.thread:
             self.thread.join(timeout=1.0)
         logger.info("System monitoring thread stopped")
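
With this change the GPU service shares the lifecycle of the existing system monitor, so callers only deal with MonitoringService; a hedged sketch of the expected flow (the constructor arguments are assumed from the pass-through above and are not shown in this diff):

# Sketch only: MonitoringService's exact constructor signature is not visible here;
# history_minutes/sample_interval are assumed from the pass-through above.
from vms.ui.monitoring.services.monitoring import MonitoringService

monitor = MonitoringService(history_minutes=10, sample_interval=5)
monitor.start_monitoring()      # now also starts monitor.gpu's background thread
gpu_metrics = monitor.gpu.get_current_metrics()
monitor.stop_monitoring()       # now also stops the GPU thread
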
vms/ui/monitoring/tabs/gpu_tab.py ADDED
@@ -0,0 +1,370 @@

"""
GPU monitoring tab for Video Model Studio UI.
Displays detailed GPU metrics and visualizations.
"""

import gradio as gr
import time
import logging
from pathlib import Path
import os
from typing import Dict, Any, List, Optional, Tuple
from datetime import datetime, timedelta

from vms.utils.base_tab import BaseTab
from vms.ui.monitoring.utils import human_readable_size

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

class GPUTab(BaseTab):
    """Tab for GPU-specific monitoring and statistics"""

    def __init__(self, app_state):
        super().__init__(app_state)
        self.id = "GPU_tab"
        self.title = "GPU Stats"
        self.refresh_interval = 5
        self.selected_gpu = 0

    def create(self, parent=None) -> gr.TabItem:
        """Create the GPU tab UI components"""
        with gr.TabItem(self.title, id=self.id) as tab:
            with gr.Row():
                gr.Markdown("## 🖥️ GPU Monitoring")

            # No GPUs available message (hidden by default)
            with gr.Row(visible=not self.app.monitoring.gpu.has_nvidia_gpus):
                with gr.Column():
                    gr.Markdown("### No NVIDIA GPUs detected")
                    gr.Markdown("GPU monitoring is only available for NVIDIA GPUs. If you have NVIDIA GPUs installed, ensure the drivers are properly configured.")

            # GPU content (only visible if GPUs are available)
            with gr.Row(visible=self.app.monitoring.gpu.has_nvidia_gpus):
                # GPU selector if multiple GPUs
                if self.app.monitoring.gpu.gpu_count > 1:
                    with gr.Column(scale=1):
                        gpu_options = [f"GPU {i}" for i in range(self.app.monitoring.gpu.gpu_count)]
                        self.components["gpu_selector"] = gr.Dropdown(
                            choices=gpu_options,
                            value=gpu_options[0] if gpu_options else None,
                            label="Select GPU",
                            interactive=True
                        )

                # Current metrics
                with gr.Column(scale=3):
                    self.components["current_metrics"] = gr.Markdown("Loading GPU metrics...")

            # Display GPU metrics in tabs
            with gr.Tabs(visible=self.app.monitoring.gpu.has_nvidia_gpus) as metrics_tabs:
                with gr.Tab(label="Utilization") as util_tab:
                    self.components["utilization_plot"] = gr.Plot()

                with gr.Tab(label="Memory") as memory_tab:
                    self.components["memory_plot"] = gr.Plot()

                with gr.Tab(label="Power") as power_tab:
                    self.components["power_plot"] = gr.Plot()

            # Process information
            with gr.Row(visible=self.app.monitoring.gpu.has_nvidia_gpus):
                with gr.Column():
                    gr.Markdown("### Active Processes")
                    self.components["process_info"] = gr.Markdown("Loading process information...")

            # GPU information summary
            with gr.Row(visible=self.app.monitoring.gpu.has_nvidia_gpus):
                with gr.Column():
                    gr.Markdown("### GPU Information")
                    self.components["gpu_info"] = gr.Markdown("Loading GPU information...")

            # Toggle for enabling/disabling auto-refresh
            with gr.Row():
                self.components["auto_refresh"] = gr.Checkbox(
                    label=f"Auto refresh (every {self.refresh_interval} seconds)",
                    value=True,
                    info="Automatically refresh GPU metrics"
                )
                self.components["refresh_btn"] = gr.Button("Refresh Now")

            # Timer for auto-refresh
            self.components["refresh_timer"] = gr.Timer(
                value=self.refresh_interval
            )

        return tab

    def connect_events(self) -> None:
        """Connect event handlers to UI components"""
        # GPU selector (if multiple GPUs)
        if self.app.monitoring.gpu.gpu_count > 1 and "gpu_selector" in self.components:
            self.components["gpu_selector"].change(
                fn=self.update_selected_gpu,
                inputs=[self.components["gpu_selector"]],
                outputs=[
                    self.components["current_metrics"],
                    self.components["utilization_plot"],
                    self.components["memory_plot"],
                    self.components["power_plot"],
                    self.components["process_info"],
                    self.components["gpu_info"]
                ]
            )

        # Manual refresh button
        self.components["refresh_btn"].click(
            fn=self.refresh_all,
            outputs=[
                self.components["current_metrics"],
                self.components["utilization_plot"],
                self.components["memory_plot"],
                self.components["power_plot"],
                self.components["process_info"],
                self.components["gpu_info"]
            ]
        )

        # Auto-refresh timer
        self.components["refresh_timer"].tick(
            fn=self.conditional_refresh,
            inputs=[self.components["auto_refresh"]],
            outputs=[
                self.components["current_metrics"],
                self.components["utilization_plot"],
                self.components["memory_plot"],
                self.components["power_plot"],
                self.components["process_info"],
                self.components["gpu_info"]
            ]
        )

    def on_enter(self):
        """Called when the tab is selected"""
        # Trigger initial refresh
        return self.refresh_all()

    def update_selected_gpu(self, gpu_selector: str) -> Tuple:
        """Update the selected GPU and refresh data

        Args:
            gpu_selector: Selected GPU string ("GPU X")

        Returns:
            Updated components
        """
        # Extract GPU index from selector string
        try:
            self.selected_gpu = int(gpu_selector.replace("GPU ", ""))
        except (ValueError, AttributeError):
            self.selected_gpu = 0

        # Refresh all components with the new selected GPU
        return self.refresh_all()

    def conditional_refresh(self, auto_refresh: bool) -> Tuple:
        """Only refresh if auto-refresh is enabled

        Args:
            auto_refresh: Whether auto-refresh is enabled

        Returns:
            Updated components or unchanged components
        """
        if auto_refresh:
            return self.refresh_all()

        # Return current values unchanged if auto-refresh is disabled
        return (
            self.components["current_metrics"].value,
            self.components["utilization_plot"].value,
            self.components["memory_plot"].value,
            self.components["power_plot"].value,
            self.components["process_info"].value,
            self.components["gpu_info"].value
        )

    def refresh_all(self) -> Tuple:
        """Refresh all GPU monitoring components

        Returns:
            Updated values for all components
        """
        try:
            if not self.app.monitoring.gpu.has_nvidia_gpus:
                return (
                    "No NVIDIA GPUs detected",
                    None,
                    None,
                    None,
                    "No process information available",
                    "No GPU information available"
                )

            # Get current metrics for the selected GPU
            all_metrics = self.app.monitoring.gpu.get_current_metrics()
            if not all_metrics or self.selected_gpu >= len(all_metrics):
                return (
                    "GPU metrics not available",
                    None,
                    None,
                    None,
                    "No process information available",
                    "No GPU information available"
                )

            # Get selected GPU metrics
            gpu_metrics = all_metrics[self.selected_gpu]

            # Format current metrics as markdown
            metrics_html = self.format_current_metrics(gpu_metrics)

            # Format process information
            process_info_html = self.format_process_info(gpu_metrics)

            # Format GPU information
            gpu_info = self.app.monitoring.gpu.get_gpu_info()
            gpu_info_html = self.format_gpu_info(gpu_info[self.selected_gpu] if self.selected_gpu < len(gpu_info) else {})

            # Generate plots
            utilization_plot = self.app.monitoring.gpu.generate_utilization_plot(self.selected_gpu)
            memory_plot = self.app.monitoring.gpu.generate_memory_plot(self.selected_gpu)
            power_plot = self.app.monitoring.gpu.generate_power_plot(self.selected_gpu)

            return (
                metrics_html,
                utilization_plot,
                memory_plot,
                power_plot,
                process_info_html,
                gpu_info_html
            )

        except Exception as e:
            logger.error(f"Error refreshing GPU data: {str(e)}", exc_info=True)
            error_msg = f"Error retrieving GPU data: {str(e)}"
            return (
                error_msg,
                None,
                None,
                None,
                error_msg,
                error_msg
            )

    def format_current_metrics(self, metrics: Dict[str, Any]) -> str:
        """Format current GPU metrics as HTML/Markdown

        Args:
            metrics: Current metrics dictionary

        Returns:
            Formatted HTML/Markdown string
        """
        if 'error' in metrics:
            return f"Error retrieving GPU metrics: {metrics['error']}"

        # Format timestamp
        if isinstance(metrics.get('timestamp'), datetime):
            timestamp_str = metrics['timestamp'].strftime('%Y-%m-%d %H:%M:%S')
        else:
            timestamp_str = "Unknown"

        # Style for GPU utilization
        util_style = "color: green;"
        if metrics.get('utilization_gpu', 0) > 90:
            util_style = "color: red; font-weight: bold;"
        elif metrics.get('utilization_gpu', 0) > 70:
            util_style = "color: orange;"

        # Style for memory usage
        mem_style = "color: green;"
        if metrics.get('memory_percent', 0) > 90:
            mem_style = "color: red; font-weight: bold;"
        elif metrics.get('memory_percent', 0) > 70:
            mem_style = "color: orange;"

        # Style for temperature
        temp_style = "color: green;"
        temp = metrics.get('temperature', 0)
        if temp > 85:
            temp_style = "color: red; font-weight: bold;"
        elif temp > 75:
            temp_style = "color: orange;"

        # Memory usage in GB
        memory_used_gb = metrics.get('memory_used', 0) / (1024**3)
        memory_total_gb = metrics.get('memory_total', 0) / (1024**3)

        # Power usage and limit
        power_html = ""
        if metrics.get('power_usage') is not None:
            power_html = f"**Power Usage:** {metrics['power_usage']:.1f}W\n"

        html = f"""
        ### Current Status as of {timestamp_str}

        **GPU Utilization:** <span style="{util_style}">{metrics.get('utilization_gpu', 0):.1f}%</span>
        **Memory Usage:** <span style="{mem_style}">{metrics.get('memory_percent', 0):.1f}% ({memory_used_gb:.2f}/{memory_total_gb:.2f} GB)</span>
        **Temperature:** <span style="{temp_style}">{metrics.get('temperature', 0)}°C</span>
        {power_html}
        """
        return html

    def format_process_info(self, metrics: Dict[str, Any]) -> str:
        """Format GPU process information as HTML/Markdown

        Args:
            metrics: Current metrics dictionary with process information

        Returns:
            Formatted HTML/Markdown string
        """
        if 'error' in metrics:
            return "Process information not available"

        processes = metrics.get('processes', [])
        if not processes:
            return "No active processes using this GPU"

        # Sort processes by memory usage (descending)
        sorted_processes = sorted(processes, key=lambda p: p.get('memory_used', 0), reverse=True)

        html = "| PID | Process Name | Memory Usage |\n"
        html += "|-----|-------------|-------------|\n"

        for proc in sorted_processes:
            pid = proc.get('pid', 'Unknown')
            name = proc.get('name', 'Unknown')
            mem_mb = proc.get('memory_used', 0) / (1024**2)  # Convert to MB

            html += f"| {pid} | {name} | {mem_mb:.1f} MB |\n"

        return html

    def format_gpu_info(self, info: Dict[str, Any]) -> str:
        """Format GPU information as HTML/Markdown

        Args:
            info: GPU information dictionary

        Returns:
            Formatted HTML/Markdown string
        """
        if 'error' in info:
            return f"GPU information not available: {info.get('error', 'Unknown error')}"

        # Format memory in GB
        memory_total_gb = info.get('memory_total', 0) / (1024**3)

        html = f"""
        **Name:** {info.get('name', 'Unknown')}
        **Memory:** {memory_total_gb:.2f} GB
        **UUID:** {info.get('uuid', 'N/A')}
        **Compute Capability:** {info.get('compute_capability', 'N/A')}
        """

        # Add power limit if available
        if info.get('power_limit') is not None:
            html += f"**Power Limit:** {info['power_limit']:.1f}W\n"

        return html
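
The auto-refresh wiring above is an instance of a generic Gradio pattern, a Timer tick gated by a Checkbox; a stripped-down sketch of just that pattern, independent of VMS (assumes a Gradio version that ships gr.Timer):

import gradio as gr
from datetime import datetime

def maybe_refresh(enabled: bool):
    # Only produce a new value when auto-refresh is enabled
    if enabled:
        return f"Last refresh: {datetime.now().strftime('%H:%M:%S')}"
    return gr.update()  # leave the output component unchanged

with gr.Blocks() as demo:
    auto = gr.Checkbox(label="Auto refresh (every 5 seconds)", value=True)
    status = gr.Markdown("Waiting for first refresh...")
    timer = gr.Timer(value=5)
    timer.tick(fn=maybe_refresh, inputs=[auto], outputs=[status])

demo.launch()
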
vms/ui/project/services/training.py CHANGED
@@ -38,7 +38,8 @@ from vms.config import (
     DEFAULT_MAX_GPUS,
     DEFAULT_PRECOMPUTATION_ITEMS,
     DEFAULT_NB_TRAINING_STEPS,
-    DEFAULT_NB_LR_WARMUP_STEPS
+    DEFAULT_NB_LR_WARMUP_STEPS,
+    DEFAULT_AUTO_RESUME
 )
 from vms.utils import (
     get_available_gpu_count,
@@ -151,7 +152,8 @@ class TrainingService:
             "training_preset": list(TRAINING_PRESETS.keys())[0],
             "num_gpus": DEFAULT_NUM_GPUS,
             "precomputation_items": DEFAULT_PRECOMPUTATION_ITEMS,
-            "lr_warmup_steps": DEFAULT_NB_LR_WARMUP_STEPS
+            "lr_warmup_steps": DEFAULT_NB_LR_WARMUP_STEPS,
+            "auto_resume": False
         }
 
         # Copy default values first
@@ -231,7 +233,8 @@ class TrainingService:
             "training_preset": list(TRAINING_PRESETS.keys())[0],
             "num_gpus": DEFAULT_NUM_GPUS,
             "precomputation_items": DEFAULT_PRECOMPUTATION_ITEMS,
-            "lr_warmup_steps": DEFAULT_NB_LR_WARMUP_STEPS
+            "lr_warmup_steps": DEFAULT_NB_LR_WARMUP_STEPS,
+            "auto_resume": DEFAULT_AUTO_RESUME
         }
 
         # Use lock for reading too to avoid reading during a write
@@ -369,6 +372,7 @@ class TrainingService:
         # Default state with all required values
         default_state = {
             "model_type": list(MODEL_TYPES.keys())[0],
+            "model_version": "",
             "training_type": list(TRAINING_TYPES.keys())[0],
             "lora_rank": DEFAULT_LORA_RANK_STR,
             "lora_alpha": DEFAULT_LORA_ALPHA_STR,
@@ -379,7 +383,8 @@ class TrainingService:
             "training_preset": list(TRAINING_PRESETS.keys())[0],
             "num_gpus": DEFAULT_NUM_GPUS,
             "precomputation_items": DEFAULT_PRECOMPUTATION_ITEMS,
-            "lr_warmup_steps": DEFAULT_NB_LR_WARMUP_STEPS
+            "lr_warmup_steps": DEFAULT_NB_LR_WARMUP_STEPS,
+            "auto_resume": False
         }
 
         # If file doesn't exist, create it with default values
@@ -1144,12 +1149,15 @@ class TrainingService:
                 "batch_size": params.get('batch_size', DEFAULT_BATCH_SIZE),
                 "learning_rate": params.get('learning_rate', DEFAULT_LEARNING_RATE),
                 "save_iterations": params.get('save_iterations', DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS),
-                "training_preset": params.get('preset_name', list(TRAINING_PRESETS.keys())[0])
+                "training_preset": params.get('preset_name', list(TRAINING_PRESETS.keys())[0]),
+                "auto_resume_checkbox": ui_state.get("auto_resume", DEFAULT_AUTO_RESUME)
             })
 
             # Check if we should auto-recover (immediate restart)
-            auto_recover = True # Always auto-recover on startup
-
+            ui_state = self.load_ui_state()
+            auto_recover = ui_state.get("auto_resume", DEFAULT_AUTO_RESUME)
+            logger.info(f"Auto-resume is {'enabled' if auto_recover else 'disabled'}")
+
             if auto_recover:
                 try:
                     result = self.start_training(
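
The behavioural change here is that auto-recovery is now opt-in through the persisted UI state instead of always on; a simplified sketch of that decision follows (the file name and helper are hypothetical, only the auto_resume key and its default mirror the diff):

import json
from pathlib import Path

DEFAULT_AUTO_RESUME = False  # mirrors the new default in vms/config.py

def should_auto_resume(ui_state_file: Path) -> bool:
    # Hypothetical helper: read the persisted UI state and fall back to the default
    if not ui_state_file.exists():
        return DEFAULT_AUTO_RESUME
    ui_state = json.loads(ui_state_file.read_text())
    return bool(ui_state.get("auto_resume", DEFAULT_AUTO_RESUME))

# Before this commit: auto_recover = True  (training always restarted on startup)
# After this commit:  auto_recover is read from the saved UI state, e.g.
#   auto_recover = should_auto_resume(Path("ui_state.json"))  # file name is illustrative
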
vms/ui/project/tabs/train_tab.py CHANGED
@@ -26,6 +26,7 @@ from vms.config import (
     DEFAULT_PRECOMPUTATION_ITEMS,
     DEFAULT_NB_TRAINING_STEPS,
     DEFAULT_NB_LR_WARMUP_STEPS,
+    DEFAULT_AUTO_RESUME
 )
 
 logger = logging.getLogger(__name__)
@@ -231,6 +232,13 @@ class TrainTab(BaseTab):
                         interactive=has_checkpoints
                     )
 
+                    with gr.Row():
+                        self.components["auto_resume_checkbox"] = gr.Checkbox(
+                            label="Automatically continue training in case of server reboot.",
+                            value=DEFAULT_AUTO_RESUME,
+                            info="When enabled, training will automatically resume from the latest checkpoint after app restart"
+                        )
+
                     with gr.Row():
                         with gr.Column():
                             self.components["status_box"] = gr.Textbox(
@@ -381,6 +389,12 @@ class TrainTab(BaseTab):
             ]
         )
 
+        self.components["auto_resume_checkbox"].change(
+            fn=lambda v: self.app.update_ui_state(auto_resume=v),
+            inputs=[self.components["auto_resume_checkbox"]],
+            outputs=[]
+        )
+
         # Add in the connect_events() method:
        self.components["num_gpus"].change(
            fn=lambda v: self.app.update_ui_state(num_gpus=v),
  fn=lambda v: self.app.update_ui_state(num_gpus=v),