Commit 0d34ea8 · Parent: c5911ab

add gpu tracking

Files changed:
- requirements.txt +4 -0
- requirements_without_flash_attention.txt +4 -0
- vms/config.py +3 -0
- vms/ui/app_ui.py +11 -4
- vms/ui/monitoring/services/gpu.py +485 -0
- vms/ui/monitoring/services/monitoring.py +13 -1
- vms/ui/monitoring/tabs/gpu_tab.py +370 -0
- vms/ui/project/services/training.py +15 -7
- vms/ui/project/tabs/train_tab.py +14 -0
requirements.txt
CHANGED
@@ -2,6 +2,7 @@ numpy>=1.26.4
 
 # to quote a-r-r-o-w/finetrainers:
 # It is recommended to use Pytorch 2.5.1 or above for training. Previous versions can lead to completely black videos, OOM errors, or other issues and are not tested.
+
 # on some system (Python 3.13+) those do not work:
 torch==2.5.1
 torchvision==0.20.1
@@ -20,6 +21,9 @@ accelerate
 bitsandbytes
 peft>=0.12.0
 
+# For GPU monitoring of NVIDIA chipsets
+pynvml
+
 # eva-decord is missing get_batch it seems
 #eva-decord==0.6.1
 decord
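For reference, a minimal sketch of the NVML calls this dependency enables (the same pynvml functions used by the new gpu.py service added later in this commit); the device loop and print formatting are illustrative only:

    # Minimal NVML query; assumes an NVIDIA driver is installed.
    import pynvml

    pynvml.nvmlInit()
    try:
        for i in range(pynvml.nvmlDeviceGetCount()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            name = pynvml.nvmlDeviceGetName(handle)
            if isinstance(name, bytes):  # older pynvml releases return bytes
                name = name.decode("utf-8")
            util = pynvml.nvmlDeviceGetUtilizationRates(handle)
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
            print(f"GPU {i} ({name}): {util.gpu}% util, "
                  f"{mem.used / 1024**3:.1f}/{mem.total / 1024**3:.1f} GB, {temp}C")
    finally:
        pynvml.nvmlShutdown()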
requirements_without_flash_attention.txt
CHANGED
@@ -21,6 +21,10 @@ accelerate
 bitsandbytes
 peft>=0.12.0
 
+# For GPU monitoring of NVIDIA chipsets
+# you probably won't be able to install that on macOS
+# pynvml
+
 # eva-decord is missing get_batch it seems
 eva-decord==0.6.1
 # decord
vms/config.py
CHANGED
@@ -150,6 +150,9 @@ DEFAULT_NB_TRAINING_STEPS = 1000
 # For this value, it is recommended to use about 20 to 40% of the number of training steps
 DEFAULT_NB_LR_WARMUP_STEPS = math.ceil(0.20 * DEFAULT_NB_TRAINING_STEPS) # 20% of training steps
 
+# Whether to automatically restart a training job after a server reboot or not
+DEFAULT_AUTO_RESUME = False
+
 # For validation
 DEFAULT_VALIDATION_NB_STEPS = 50
 DEFAULT_VALIDATION_HEIGHT = 512
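The new default is merged with whatever UI state was last persisted. A rough sketch of that default-merge pattern (the ui_state.json filename and the standalone load_ui_state helper here are illustrative assumptions; the real persistence lives in vms/ui/project/services/training.py):

    # Hypothetical helper showing how a config default backs a persisted setting.
    import json
    from pathlib import Path

    DEFAULT_AUTO_RESUME = False  # mirrors the new value in vms/config.py

    def load_ui_state(path: Path = Path("ui_state.json")) -> dict:
        """Overlay saved values on top of defaults so new keys still get a value."""
        state = {"auto_resume": DEFAULT_AUTO_RESUME}
        if path.exists():
            state.update(json.loads(path.read_text()))
        return state

    auto_resume = load_ui_state().get("auto_resume", DEFAULT_AUTO_RESUME)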
vms/ui/app_ui.py
CHANGED
@@ -19,7 +19,8 @@ from vms.config import (
     DEFAULT_MAX_GPUS,
     DEFAULT_PRECOMPUTATION_ITEMS,
     DEFAULT_NB_TRAINING_STEPS,
-    DEFAULT_NB_LR_WARMUP_STEPS
+    DEFAULT_NB_LR_WARMUP_STEPS,
+    DEFAULT_AUTO_RESUME
 )
 from vms.utils import (
     get_recommended_precomputation_items,
@@ -40,7 +41,7 @@ from vms.ui.monitoring.services import (
 )
 
 from vms.ui.monitoring.tabs import (
-    GeneralTab
+    GeneralTab, GPUTab
 )
 
 logger = logging.getLogger(__name__)
@@ -183,6 +184,8 @@ class AppUI:
         # Initialize monitoring tab objects
         self.monitor_tabs["general_tab"] = GeneralTab(self)
 
+        self.monitor_tabs["gpu_tab"] = GPUTab(self)
+
         # Create tab UI components for monitoring
         for tab_id, tab_obj in self.monitor_tabs.items():
             tab_obj.create(monitoring_tabs)
@@ -230,7 +233,8 @@ class AppUI:
                 self.project_tabs["train_tab"].components["current_task_box"],
                 self.project_tabs["train_tab"].components["num_gpus"],
                 self.project_tabs["train_tab"].components["precomputation_items"],
-                self.project_tabs["train_tab"].components["lr_warmup_steps"]
+                self.project_tabs["train_tab"].components["lr_warmup_steps"],
+                self.project_tabs["train_tab"].components["auto_resume_checkbox"]
             ]
         )
 
@@ -376,6 +380,8 @@ class AppUI:
         # Get model_version value
         model_version_val = ""
 
+        auto_resume_val = ui_state.get("auto_resume", DEFAULT_AUTO_RESUME)
+
         # First get the internal model type for the currently selected model
         model_internal_type = MODEL_TYPES.get(model_type_val)
         logger.info(f"Initializing model version for model_type: {model_type_val} (internal: {model_internal_type})")
@@ -480,7 +486,8 @@ class AppUI:
             current_task_val,
             num_gpus_val,
             precomputation_items_val,
-            lr_warmup_steps_val
+            lr_warmup_steps_val,
+            auto_resume_val
         )
 
     def initialize_ui_from_state(self):
vms/ui/monitoring/services/gpu.py
ADDED
@@ -0,0 +1,485 @@
"""
GPU monitoring service for Video Model Studio.
Tracks NVIDIA GPU resources like utilization, memory, and temperature.
"""

import os
import time
import logging
from typing import Dict, List, Any, Optional, Tuple
from collections import deque
from datetime import datetime

# Force the use of the Agg backend which is thread-safe
import matplotlib
matplotlib.use('Agg')  # Must be before importing pyplot
import matplotlib.pyplot as plt
import numpy as np

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Optional import of pynvml
try:
    import pynvml
    PYNVML_AVAILABLE = True
except ImportError:
    PYNVML_AVAILABLE = False
    logger.info("pynvml not available, GPU monitoring will be limited")

class GPUMonitoringService:
    """Service for monitoring NVIDIA GPU resources"""

    def __init__(self, history_minutes: int = 10, sample_interval: int = 5):
        """Initialize the GPU monitoring service

        Args:
            history_minutes: How many minutes of history to keep
            sample_interval: How many seconds between samples
        """
        self.history_minutes = history_minutes
        self.sample_interval = sample_interval
        self.max_samples = (history_minutes * 60) // sample_interval

        # Track if the monitoring thread is running
        self.is_running = False
        self.thread = None

        # Check if NVIDIA GPUs are available
        self.has_nvidia_gpus = False
        self.gpu_count = 0
        self.device_info = []
        self.history = {}

        # Try to initialize NVML
        self._initialize_nvml()

        # Initialize history data structures if GPUs are available
        if self.has_nvidia_gpus:
            self._initialize_history()

    def _initialize_nvml(self):
        """Initialize NVIDIA Management Library"""
        if not PYNVML_AVAILABLE:
            logger.info("pynvml module not installed, GPU monitoring disabled")
            return

        try:
            pynvml.nvmlInit()
            self.gpu_count = pynvml.nvmlDeviceGetCount()
            self.has_nvidia_gpus = self.gpu_count > 0

            if self.has_nvidia_gpus:
                logger.info(f"Successfully initialized NVML, found {self.gpu_count} GPU(s)")
                # Get static information about each GPU
                for i in range(self.gpu_count):
                    self.device_info.append(self._get_device_info(i))
            else:
                logger.info("No NVIDIA GPUs found")

        except Exception as e:
            logger.warning(f"Failed to initialize NVML: {str(e)}")
            self.has_nvidia_gpus = False

    def _initialize_history(self):
        """Initialize data structures for storing metric history"""
        for i in range(self.gpu_count):
            self.history[i] = {
                'timestamps': deque(maxlen=self.max_samples),
                'utilization': deque(maxlen=self.max_samples),
                'memory_used': deque(maxlen=self.max_samples),
                'memory_total': deque(maxlen=self.max_samples),
                'memory_percent': deque(maxlen=self.max_samples),
                'temperature': deque(maxlen=self.max_samples),
                'power_usage': deque(maxlen=self.max_samples),
                'power_limit': deque(maxlen=self.max_samples),
            }

    def _get_device_info(self, device_index: int) -> Dict[str, Any]:
        """Get static information about a GPU device

        Args:
            device_index: Index of the GPU device

        Returns:
            Dictionary with device information
        """
        if not PYNVML_AVAILABLE or not self.has_nvidia_gpus:
            return {"error": "NVIDIA GPUs not available"}

        try:
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)

            # Get device name (decode if it's bytes)
            name = pynvml.nvmlDeviceGetName(handle)
            if isinstance(name, bytes):
                name = name.decode('utf-8')

            # Get device UUID
            uuid = pynvml.nvmlDeviceGetUUID(handle)
            if isinstance(uuid, bytes):
                uuid = uuid.decode('utf-8')

            # Get memory info, compute capability
            memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            compute_capability = pynvml.nvmlDeviceGetCudaComputeCapability(handle)

            # Get power limits if available
            try:
                power_limit = pynvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000.0  # Convert to watts
            except pynvml.NVMLError:
                power_limit = None

            return {
                'index': device_index,
                'name': name,
                'uuid': uuid,
                'memory_total': memory_info.total,
                'memory_total_gb': memory_info.total / (1024**3),  # Convert to GB
                'compute_capability': f"{compute_capability[0]}.{compute_capability[1]}",
                'power_limit': power_limit
            }

        except Exception as e:
            logger.error(f"Error getting device info for GPU {device_index}: {str(e)}")
            return {"error": str(e), "index": device_index}

    def collect_gpu_metrics(self) -> List[Dict[str, Any]]:
        """Collect current GPU metrics for all available GPUs

        Returns:
            List of dictionaries with current metrics for each GPU
        """
        if not PYNVML_AVAILABLE or not self.has_nvidia_gpus:
            return []

        metrics = []
        timestamp = datetime.now()

        for i in range(self.gpu_count):
            try:
                handle = pynvml.nvmlDeviceGetHandleByIndex(i)

                # Get utilization rates
                utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)

                # Get memory information
                memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)

                # Get temperature
                temperature = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)

                # Get power usage if available
                try:
                    power_usage = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0  # Convert to watts
                except pynvml.NVMLError:
                    power_usage = None

                # Get process information
                processes = []
                try:
                    for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
                        try:
                            process_name = pynvml.nvmlSystemGetProcessName(proc.pid)
                            if isinstance(process_name, bytes):
                                process_name = process_name.decode('utf-8')
                        except pynvml.NVMLError:
                            process_name = f"Unknown (PID: {proc.pid})"

                        processes.append({
                            'pid': proc.pid,
                            'name': process_name,
                            'memory_used': proc.usedGpuMemory,
                            'memory_used_mb': proc.usedGpuMemory / (1024**2)  # Convert to MB
                        })
                except pynvml.NVMLError:
                    # Unable to get process information, continue with empty list
                    pass

                gpu_metrics = {
                    'index': i,
                    'timestamp': timestamp,
                    'utilization_gpu': utilization.gpu,
                    'utilization_memory': utilization.memory,
                    'memory_total': memory_info.total,
                    'memory_used': memory_info.used,
                    'memory_free': memory_info.free,
                    'memory_percent': (memory_info.used / memory_info.total) * 100,
                    'temperature': temperature,
                    'power_usage': power_usage,
                    'processes': processes
                }

                metrics.append(gpu_metrics)

            except Exception as e:
                logger.error(f"Error collecting metrics for GPU {i}: {str(e)}")
                metrics.append({
                    'index': i,
                    'error': str(e)
                })

        return metrics

    def update_history(self):
        """Update GPU metrics history"""
        if not self.has_nvidia_gpus:
            return

        current_metrics = self.collect_gpu_metrics()
        timestamp = datetime.now()

        for gpu_metrics in current_metrics:
            if 'error' in gpu_metrics:
                continue

            idx = gpu_metrics['index']

            self.history[idx]['timestamps'].append(timestamp)
            self.history[idx]['utilization'].append(gpu_metrics['utilization_gpu'])
            self.history[idx]['memory_used'].append(gpu_metrics['memory_used'])
            self.history[idx]['memory_total'].append(gpu_metrics['memory_total'])
            self.history[idx]['memory_percent'].append(gpu_metrics['memory_percent'])
            self.history[idx]['temperature'].append(gpu_metrics['temperature'])

            if gpu_metrics['power_usage'] is not None:
                self.history[idx]['power_usage'].append(gpu_metrics['power_usage'])
            else:
                self.history[idx]['power_usage'].append(0)

            # Store power limit in history (static but kept for consistency)
            info = self.device_info[idx]
            if 'power_limit' in info and info['power_limit'] is not None:
                self.history[idx]['power_limit'].append(info['power_limit'])
            else:
                self.history[idx]['power_limit'].append(0)

    def start_monitoring(self):
        """Start background thread for collecting GPU metrics"""
        if self.is_running:
            logger.warning("GPU monitoring thread already running")
            return

        if not self.has_nvidia_gpus:
            logger.info("No NVIDIA GPUs found, not starting monitoring thread")
            return

        import threading

        self.is_running = True

        def _monitor_loop():
            while self.is_running:
                try:
                    self.update_history()
                    time.sleep(self.sample_interval)
                except Exception as e:
                    logger.error(f"Error in GPU monitoring thread: {str(e)}", exc_info=True)
                    time.sleep(self.sample_interval)

        self.thread = threading.Thread(target=_monitor_loop, daemon=True)
        self.thread.start()
        logger.info("GPU monitoring thread started")

    def stop_monitoring(self):
        """Stop the GPU monitoring thread"""
        if not self.is_running:
            return

        self.is_running = False
        if self.thread:
            self.thread.join(timeout=1.0)
        logger.info("GPU monitoring thread stopped")

    def get_gpu_info(self) -> List[Dict[str, Any]]:
        """Get information about all available GPUs

        Returns:
            List of dictionaries with GPU information
        """
        return self.device_info

    def get_current_metrics(self) -> List[Dict[str, Any]]:
        """Get current metrics for all GPUs

        Returns:
            List of dictionaries with current GPU metrics
        """
        return self.collect_gpu_metrics()

    def generate_utilization_plot(self, gpu_index: int) -> plt.Figure:
        """Generate a plot of GPU utilization over time

        Args:
            gpu_index: Index of the GPU to plot

        Returns:
            Matplotlib figure with utilization plot
        """
        plt.close('all')  # Close all existing figures
        fig, ax = plt.subplots(figsize=(10, 5))

        if not self.has_nvidia_gpus or gpu_index not in self.history:
            ax.set_title(f"No data available for GPU {gpu_index}")
            return fig

        history = self.history[gpu_index]
        if not history['timestamps']:
            ax.set_title(f"No history data for GPU {gpu_index}")
            return fig

        # Convert timestamps to strings
        x = [t.strftime('%H:%M:%S') for t in history['timestamps']]

        # If we have many points, show fewer labels for readability
        if len(x) > 10:
            step = len(x) // 10
            ax.set_xticks(range(0, len(x), step))
            ax.set_xticklabels([x[i] for i in range(0, len(x), step)], rotation=45)

        # Plot utilization
        ax.plot(x, list(history['utilization']), 'b-', label='GPU Utilization %')
        ax.set_ylim(0, 100)

        # Add temperature on secondary y-axis
        ax2 = ax.twinx()
        ax2.plot(x, list(history['temperature']), 'r-', label='Temperature °C')
        ax2.set_ylabel('Temperature (°C)', color='r')
        ax2.tick_params(axis='y', colors='r')

        # Set labels and title
        ax.set_title(f'GPU {gpu_index} Utilization Over Time')
        ax.set_xlabel('Time')
        ax.set_ylabel('Utilization %')
        ax.grid(True, alpha=0.3)

        # Add legend
        lines, labels = ax.get_legend_handles_labels()
        lines2, labels2 = ax2.get_legend_handles_labels()
        ax.legend(lines + lines2, labels + labels2, loc='upper left')

        plt.tight_layout()
        return fig

    def generate_memory_plot(self, gpu_index: int) -> plt.Figure:
        """Generate a plot of GPU memory usage over time

        Args:
            gpu_index: Index of the GPU to plot

        Returns:
            Matplotlib figure with memory usage plot
        """
        plt.close('all')  # Close all existing figures
        fig, ax = plt.subplots(figsize=(10, 5))

        if not self.has_nvidia_gpus or gpu_index not in self.history:
            ax.set_title(f"No data available for GPU {gpu_index}")
            return fig

        history = self.history[gpu_index]
        if not history['timestamps']:
            ax.set_title(f"No history data for GPU {gpu_index}")
            return fig

        # Convert timestamps to strings
        x = [t.strftime('%H:%M:%S') for t in history['timestamps']]

        # If we have many points, show fewer labels for readability
        if len(x) > 10:
            step = len(x) // 10
            ax.set_xticks(range(0, len(x), step))
            ax.set_xticklabels([x[i] for i in range(0, len(x), step)], rotation=45)

        # Plot memory percentage
        ax.plot(x, list(history['memory_percent']), 'g-', label='Memory Usage %')
        ax.set_ylim(0, 100)

        # Add absolute memory values on secondary y-axis (convert to GB)
        ax2 = ax.twinx()
        memory_used_gb = [m / (1024**3) for m in history['memory_used']]
        memory_total_gb = [m / (1024**3) for m in history['memory_total']]

        ax2.plot(x, memory_used_gb, 'm--', label='Used (GB)')
        ax2.set_ylabel('Memory (GB)')

        # Set labels and title
        ax.set_title(f'GPU {gpu_index} Memory Usage Over Time')
        ax.set_xlabel('Time')
        ax.set_ylabel('Usage %')
        ax.grid(True, alpha=0.3)

        # Add legend
        lines, labels = ax.get_legend_handles_labels()
        lines2, labels2 = ax2.get_legend_handles_labels()
        ax.legend(lines + lines2, labels + labels2, loc='upper left')

        plt.tight_layout()
        return fig

    def generate_power_plot(self, gpu_index: int) -> plt.Figure:
        """Generate a plot of GPU power usage over time

        Args:
            gpu_index: Index of the GPU to plot

        Returns:
            Matplotlib figure with power usage plot
        """
        plt.close('all')  # Close all existing figures
        fig, ax = plt.subplots(figsize=(10, 5))

        if not self.has_nvidia_gpus or gpu_index not in self.history:
            ax.set_title(f"No data available for GPU {gpu_index}")
            return fig

        history = self.history[gpu_index]
        if not history['timestamps'] or not any(history['power_usage']):
            ax.set_title(f"No power data for GPU {gpu_index}")
            return fig

        # Convert timestamps to strings
        x = [t.strftime('%H:%M:%S') for t in history['timestamps']]

        # If we have many points, show fewer labels for readability
        if len(x) > 10:
            step = len(x) // 10
            ax.set_xticks(range(0, len(x), step))
            ax.set_xticklabels([x[i] for i in range(0, len(x), step)], rotation=45)

        # Plot power usage
        power_usage = list(history['power_usage'])
        if any(power_usage):  # Only plot if we have actual power data
            ax.plot(x, power_usage, 'b-', label='Power Usage (W)')

            # Get power limit if available
            power_limit = list(history['power_limit'])
            if any(power_limit):  # Only plot if we have power limit data
                # Show power limit as horizontal line
                limit = max(power_limit)  # Should be constant, but take max just in case
                if limit > 0:
                    ax.axhline(y=limit, color='r', linestyle='--', label=f'Power Limit ({limit}W)')

            # Set labels and title
            ax.set_title(f'GPU {gpu_index} Power Usage Over Time')
            ax.set_xlabel('Time')
            ax.set_ylabel('Power (Watts)')
            ax.grid(True, alpha=0.3)
            ax.legend(loc='upper left')
        else:
            ax.set_title(f"Power data not available for GPU {gpu_index}")

        plt.tight_layout()
        return fig

    def shutdown(self):
        """Clean up resources when shutting down"""
        self.stop_monitoring()

        # Shutdown NVML if it was initialized
        if PYNVML_AVAILABLE and self.has_nvidia_gpus:
            try:
                pynvml.nvmlShutdown()
                logger.info("NVML shutdown complete")
            except Exception as e:
                logger.error(f"Error during NVML shutdown: {str(e)}")
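Taken on its own, the new service can be exercised roughly like this (a sketch based only on the methods defined above; the sleep duration and output filename are arbitrary):

    # Poll the service for a few samples, render a plot, then clean up.
    import time
    from vms.ui.monitoring.services.gpu import GPUMonitoringService

    svc = GPUMonitoringService(history_minutes=10, sample_interval=5)
    if svc.has_nvidia_gpus:
        svc.start_monitoring()   # background thread calls update_history() every 5s
        time.sleep(15)           # let a few samples accumulate
        for gpu in svc.get_current_metrics():
            print(gpu["index"], gpu["utilization_gpu"], gpu["memory_percent"])
        fig = svc.generate_utilization_plot(0)
        fig.savefig("gpu0_utilization.png")
    svc.shutdown()               # stops the thread and calls nvmlShutdown()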
vms/ui/monitoring/services/monitoring.py
CHANGED
@@ -21,6 +21,8 @@ import matplotlib.pyplot as plt
 
 import numpy as np
 
+from vms.ui.monitoring.services.gpu import GPUMonitoringService
+
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 
@@ -51,6 +53,9 @@ class MonitoringService:
         # Per-core CPU history
         self.cpu_cores_percent = {}
 
+        # Initialize GPU monitoring service
+        self.gpu = GPUMonitoringService(history_minutes=history_minutes, sample_interval=sample_interval)
+
         # Track if the monitoring thread is running
         self.is_running = False
         self.thread = None
@@ -124,6 +129,9 @@ class MonitoringService:
             return
 
         self.is_running = True
+
+        # Start GPU monitoring if available
+        self.gpu.start_monitoring()
 
         def _monitor_loop():
             while self.is_running:
@@ -143,8 +151,12 @@ class MonitoringService:
         """Stop the monitoring thread"""
         if not self.is_running:
             return
-
+
         self.is_running = False
+
+        # Stop GPU monitoring
+        self.gpu.stop_monitoring()
+
         if self.thread:
             self.thread.join(timeout=1.0)
         logger.info("System monitoring thread stopped")
vms/ui/monitoring/tabs/gpu_tab.py
ADDED
@@ -0,0 +1,370 @@
"""
GPU monitoring tab for Video Model Studio UI.
Displays detailed GPU metrics and visualizations.
"""

import gradio as gr
import time
import logging
from pathlib import Path
import os
from typing import Dict, Any, List, Optional, Tuple
from datetime import datetime, timedelta

from vms.utils.base_tab import BaseTab
from vms.ui.monitoring.utils import human_readable_size

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

class GPUTab(BaseTab):
    """Tab for GPU-specific monitoring and statistics"""

    def __init__(self, app_state):
        super().__init__(app_state)
        self.id = "GPU_tab"
        self.title = "GPU Stats"
        self.refresh_interval = 5
        self.selected_gpu = 0

    def create(self, parent=None) -> gr.TabItem:
        """Create the GPU tab UI components"""
        with gr.TabItem(self.title, id=self.id) as tab:
            with gr.Row():
                gr.Markdown("## 🖥️ GPU Monitoring")

            # No GPUs available message (hidden by default)
            with gr.Row(visible=not self.app.monitoring.gpu.has_nvidia_gpus):
                with gr.Column():
                    gr.Markdown("### No NVIDIA GPUs detected")
                    gr.Markdown("GPU monitoring is only available for NVIDIA GPUs. If you have NVIDIA GPUs installed, ensure the drivers are properly configured.")

            # GPU content (only visible if GPUs are available)
            with gr.Row(visible=self.app.monitoring.gpu.has_nvidia_gpus):
                # GPU selector if multiple GPUs
                if self.app.monitoring.gpu.gpu_count > 1:
                    with gr.Column(scale=1):
                        gpu_options = [f"GPU {i}" for i in range(self.app.monitoring.gpu.gpu_count)]
                        self.components["gpu_selector"] = gr.Dropdown(
                            choices=gpu_options,
                            value=gpu_options[0] if gpu_options else None,
                            label="Select GPU",
                            interactive=True
                        )

                # Current metrics
                with gr.Column(scale=3):
                    self.components["current_metrics"] = gr.Markdown("Loading GPU metrics...")

            # Display GPU metrics in tabs
            with gr.Tabs(visible=self.app.monitoring.gpu.has_nvidia_gpus) as metrics_tabs:
                with gr.Tab(label="Utilization") as util_tab:
                    self.components["utilization_plot"] = gr.Plot()

                with gr.Tab(label="Memory") as memory_tab:
                    self.components["memory_plot"] = gr.Plot()

                with gr.Tab(label="Power") as power_tab:
                    self.components["power_plot"] = gr.Plot()

            # Process information
            with gr.Row(visible=self.app.monitoring.gpu.has_nvidia_gpus):
                with gr.Column():
                    gr.Markdown("### Active Processes")
                    self.components["process_info"] = gr.Markdown("Loading process information...")

            # GPU information summary
            with gr.Row(visible=self.app.monitoring.gpu.has_nvidia_gpus):
                with gr.Column():
                    gr.Markdown("### GPU Information")
                    self.components["gpu_info"] = gr.Markdown("Loading GPU information...")

            # Toggle for enabling/disabling auto-refresh
            with gr.Row():
                self.components["auto_refresh"] = gr.Checkbox(
                    label=f"Auto refresh (every {self.refresh_interval} seconds)",
                    value=True,
                    info="Automatically refresh GPU metrics"
                )
                self.components["refresh_btn"] = gr.Button("Refresh Now")

            # Timer for auto-refresh
            self.components["refresh_timer"] = gr.Timer(
                value=self.refresh_interval
            )

        return tab

    def connect_events(self) -> None:
        """Connect event handlers to UI components"""
        # GPU selector (if multiple GPUs)
        if self.app.monitoring.gpu.gpu_count > 1 and "gpu_selector" in self.components:
            self.components["gpu_selector"].change(
                fn=self.update_selected_gpu,
                inputs=[self.components["gpu_selector"]],
                outputs=[
                    self.components["current_metrics"],
                    self.components["utilization_plot"],
                    self.components["memory_plot"],
                    self.components["power_plot"],
                    self.components["process_info"],
                    self.components["gpu_info"]
                ]
            )

        # Manual refresh button
        self.components["refresh_btn"].click(
            fn=self.refresh_all,
            outputs=[
                self.components["current_metrics"],
                self.components["utilization_plot"],
                self.components["memory_plot"],
                self.components["power_plot"],
                self.components["process_info"],
                self.components["gpu_info"]
            ]
        )

        # Auto-refresh timer
        self.components["refresh_timer"].tick(
            fn=self.conditional_refresh,
            inputs=[self.components["auto_refresh"]],
            outputs=[
                self.components["current_metrics"],
                self.components["utilization_plot"],
                self.components["memory_plot"],
                self.components["power_plot"],
                self.components["process_info"],
                self.components["gpu_info"]
            ]
        )

    def on_enter(self):
        """Called when the tab is selected"""
        # Trigger initial refresh
        return self.refresh_all()

    def update_selected_gpu(self, gpu_selector: str) -> Tuple:
        """Update the selected GPU and refresh data

        Args:
            gpu_selector: Selected GPU string ("GPU X")

        Returns:
            Updated components
        """
        # Extract GPU index from selector string
        try:
            self.selected_gpu = int(gpu_selector.replace("GPU ", ""))
        except (ValueError, AttributeError):
            self.selected_gpu = 0

        # Refresh all components with the new selected GPU
        return self.refresh_all()

    def conditional_refresh(self, auto_refresh: bool) -> Tuple:
        """Only refresh if auto-refresh is enabled

        Args:
            auto_refresh: Whether auto-refresh is enabled

        Returns:
            Updated components or unchanged components
        """
        if auto_refresh:
            return self.refresh_all()

        # Return current values unchanged if auto-refresh is disabled
        return (
            self.components["current_metrics"].value,
            self.components["utilization_plot"].value,
            self.components["memory_plot"].value,
            self.components["power_plot"].value,
            self.components["process_info"].value,
            self.components["gpu_info"].value
        )

    def refresh_all(self) -> Tuple:
        """Refresh all GPU monitoring components

        Returns:
            Updated values for all components
        """
        try:
            if not self.app.monitoring.gpu.has_nvidia_gpus:
                return (
                    "No NVIDIA GPUs detected",
                    None,
                    None,
                    None,
                    "No process information available",
                    "No GPU information available"
                )

            # Get current metrics for the selected GPU
            all_metrics = self.app.monitoring.gpu.get_current_metrics()
            if not all_metrics or self.selected_gpu >= len(all_metrics):
                return (
                    "GPU metrics not available",
                    None,
                    None,
                    None,
                    "No process information available",
                    "No GPU information available"
                )

            # Get selected GPU metrics
            gpu_metrics = all_metrics[self.selected_gpu]

            # Format current metrics as markdown
            metrics_html = self.format_current_metrics(gpu_metrics)

            # Format process information
            process_info_html = self.format_process_info(gpu_metrics)

            # Format GPU information
            gpu_info = self.app.monitoring.gpu.get_gpu_info()
            gpu_info_html = self.format_gpu_info(gpu_info[self.selected_gpu] if self.selected_gpu < len(gpu_info) else {})

            # Generate plots
            utilization_plot = self.app.monitoring.gpu.generate_utilization_plot(self.selected_gpu)
            memory_plot = self.app.monitoring.gpu.generate_memory_plot(self.selected_gpu)
            power_plot = self.app.monitoring.gpu.generate_power_plot(self.selected_gpu)

            return (
                metrics_html,
                utilization_plot,
                memory_plot,
                power_plot,
                process_info_html,
                gpu_info_html
            )

        except Exception as e:
            logger.error(f"Error refreshing GPU data: {str(e)}", exc_info=True)
            error_msg = f"Error retrieving GPU data: {str(e)}"
            return (
                error_msg,
                None,
                None,
                None,
                error_msg,
                error_msg
            )

    def format_current_metrics(self, metrics: Dict[str, Any]) -> str:
        """Format current GPU metrics as HTML/Markdown

        Args:
            metrics: Current metrics dictionary

        Returns:
            Formatted HTML/Markdown string
        """
        if 'error' in metrics:
            return f"Error retrieving GPU metrics: {metrics['error']}"

        # Format timestamp
        if isinstance(metrics.get('timestamp'), datetime):
            timestamp_str = metrics['timestamp'].strftime('%Y-%m-%d %H:%M:%S')
        else:
            timestamp_str = "Unknown"

        # Style for GPU utilization
        util_style = "color: green;"
        if metrics.get('utilization_gpu', 0) > 90:
            util_style = "color: red; font-weight: bold;"
        elif metrics.get('utilization_gpu', 0) > 70:
            util_style = "color: orange;"

        # Style for memory usage
        mem_style = "color: green;"
        if metrics.get('memory_percent', 0) > 90:
            mem_style = "color: red; font-weight: bold;"
        elif metrics.get('memory_percent', 0) > 70:
            mem_style = "color: orange;"

        # Style for temperature
        temp_style = "color: green;"
        temp = metrics.get('temperature', 0)
        if temp > 85:
            temp_style = "color: red; font-weight: bold;"
        elif temp > 75:
            temp_style = "color: orange;"

        # Memory usage in GB
        memory_used_gb = metrics.get('memory_used', 0) / (1024**3)
        memory_total_gb = metrics.get('memory_total', 0) / (1024**3)

        # Power usage and limit
        power_html = ""
        if metrics.get('power_usage') is not None:
            power_html = f"**Power Usage:** {metrics['power_usage']:.1f}W\n"

        html = f"""
### Current Status as of {timestamp_str}

**GPU Utilization:** <span style="{util_style}">{metrics.get('utilization_gpu', 0):.1f}%</span>
**Memory Usage:** <span style="{mem_style}">{metrics.get('memory_percent', 0):.1f}% ({memory_used_gb:.2f}/{memory_total_gb:.2f} GB)</span>
**Temperature:** <span style="{temp_style}">{metrics.get('temperature', 0)}°C</span>
{power_html}
"""
        return html

    def format_process_info(self, metrics: Dict[str, Any]) -> str:
        """Format GPU process information as HTML/Markdown

        Args:
            metrics: Current metrics dictionary with process information

        Returns:
            Formatted HTML/Markdown string
        """
        if 'error' in metrics:
            return "Process information not available"

        processes = metrics.get('processes', [])
        if not processes:
            return "No active processes using this GPU"

        # Sort processes by memory usage (descending)
        sorted_processes = sorted(processes, key=lambda p: p.get('memory_used', 0), reverse=True)

        html = "| PID | Process Name | Memory Usage |\n"
        html += "|-----|-------------|-------------|\n"

        for proc in sorted_processes:
            pid = proc.get('pid', 'Unknown')
            name = proc.get('name', 'Unknown')
            mem_mb = proc.get('memory_used', 0) / (1024**2)  # Convert to MB

            html += f"| {pid} | {name} | {mem_mb:.1f} MB |\n"

        return html

    def format_gpu_info(self, info: Dict[str, Any]) -> str:
        """Format GPU information as HTML/Markdown

        Args:
            info: GPU information dictionary

        Returns:
            Formatted HTML/Markdown string
        """
        if 'error' in info:
            return f"GPU information not available: {info.get('error', 'Unknown error')}"

        # Format memory in GB
        memory_total_gb = info.get('memory_total', 0) / (1024**3)

        html = f"""
**Name:** {info.get('name', 'Unknown')}
**Memory:** {memory_total_gb:.2f} GB
**UUID:** {info.get('uuid', 'N/A')}
**Compute Capability:** {info.get('compute_capability', 'N/A')}
"""

        # Add power limit if available
        if info.get('power_limit') is not None:
            html += f"**Power Limit:** {info['power_limit']:.1f}W\n"

        return html
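The refresh wiring above boils down to a small Gradio pattern: a gr.Timer whose tick event is gated by a checkbox. A standalone sketch of just that pattern (the 5-second interval and labels are arbitrary; gr.Timer requires a reasonably recent Gradio release):

    # Periodic refresh that a checkbox can pause without stopping the timer.
    import datetime
    import gradio as gr

    def refresh(auto_enabled: bool):
        if not auto_enabled:
            return gr.update()  # leave the output untouched when auto-refresh is off
        return f"Last refresh: {datetime.datetime.now():%H:%M:%S}"

    with gr.Blocks() as demo:
        auto = gr.Checkbox(label="Auto refresh", value=True)
        status = gr.Markdown("Waiting for first tick...")
        timer = gr.Timer(value=5)  # fires every 5 seconds
        timer.tick(fn=refresh, inputs=[auto], outputs=[status])

    demo.launch()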
vms/ui/project/services/training.py
CHANGED
@@ -38,7 +38,8 @@ from vms.config import (
     DEFAULT_MAX_GPUS,
     DEFAULT_PRECOMPUTATION_ITEMS,
     DEFAULT_NB_TRAINING_STEPS,
-    DEFAULT_NB_LR_WARMUP_STEPS
+    DEFAULT_NB_LR_WARMUP_STEPS,
+    DEFAULT_AUTO_RESUME
 )
 from vms.utils import (
     get_available_gpu_count,
@@ -151,7 +152,8 @@ class TrainingService:
             "training_preset": list(TRAINING_PRESETS.keys())[0],
             "num_gpus": DEFAULT_NUM_GPUS,
             "precomputation_items": DEFAULT_PRECOMPUTATION_ITEMS,
-            "lr_warmup_steps": DEFAULT_NB_LR_WARMUP_STEPS
+            "lr_warmup_steps": DEFAULT_NB_LR_WARMUP_STEPS,
+            "auto_resume": False
         }
 
         # Copy default values first
@@ -231,7 +233,8 @@ class TrainingService:
             "training_preset": list(TRAINING_PRESETS.keys())[0],
             "num_gpus": DEFAULT_NUM_GPUS,
             "precomputation_items": DEFAULT_PRECOMPUTATION_ITEMS,
-            "lr_warmup_steps": DEFAULT_NB_LR_WARMUP_STEPS
+            "lr_warmup_steps": DEFAULT_NB_LR_WARMUP_STEPS,
+            "auto_resume": DEFAULT_AUTO_RESUME
         }
 
         # Use lock for reading too to avoid reading during a write
@@ -369,6 +372,7 @@ class TrainingService:
         # Default state with all required values
         default_state = {
             "model_type": list(MODEL_TYPES.keys())[0],
+            "model_version": "",
             "training_type": list(TRAINING_TYPES.keys())[0],
             "lora_rank": DEFAULT_LORA_RANK_STR,
             "lora_alpha": DEFAULT_LORA_ALPHA_STR,
@@ -379,7 +383,8 @@ class TrainingService:
             "training_preset": list(TRAINING_PRESETS.keys())[0],
             "num_gpus": DEFAULT_NUM_GPUS,
             "precomputation_items": DEFAULT_PRECOMPUTATION_ITEMS,
-            "lr_warmup_steps": DEFAULT_NB_LR_WARMUP_STEPS
+            "lr_warmup_steps": DEFAULT_NB_LR_WARMUP_STEPS,
+            "auto_resume": False
         }
 
         # If file doesn't exist, create it with default values
@@ -1144,12 +1149,15 @@ class TrainingService:
             "batch_size": params.get('batch_size', DEFAULT_BATCH_SIZE),
             "learning_rate": params.get('learning_rate', DEFAULT_LEARNING_RATE),
             "save_iterations": params.get('save_iterations', DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS),
-            "training_preset": params.get('preset_name', list(TRAINING_PRESETS.keys())[0])
+            "training_preset": params.get('preset_name', list(TRAINING_PRESETS.keys())[0]),
+            "auto_resume_checkbox": ui_state.get("auto_resume", DEFAULT_AUTO_RESUME)
         })
 
         # Check if we should auto-recover (immediate restart)
-
-
+        ui_state = self.load_ui_state()
+        auto_recover = ui_state.get("auto_resume", DEFAULT_AUTO_RESUME)
+        logger.info(f"Auto-resume is {'enabled' if auto_recover else 'disabled'}")
+
         if auto_recover:
             try:
                 result = self.start_training(
vms/ui/project/tabs/train_tab.py
CHANGED
@@ -26,6 +26,7 @@ from vms.config import (
     DEFAULT_PRECOMPUTATION_ITEMS,
     DEFAULT_NB_TRAINING_STEPS,
     DEFAULT_NB_LR_WARMUP_STEPS,
+    DEFAULT_AUTO_RESUME
 )
 
 logger = logging.getLogger(__name__)
@@ -231,6 +232,13 @@ class TrainTab(BaseTab):
                         interactive=has_checkpoints
                     )
 
+                with gr.Row():
+                    self.components["auto_resume_checkbox"] = gr.Checkbox(
+                        label="Automatically continue training in case of server reboot.",
+                        value=DEFAULT_AUTO_RESUME,
+                        info="When enabled, training will automatically resume from the latest checkpoint after app restart"
+                    )
+
                 with gr.Row():
                     with gr.Column():
                         self.components["status_box"] = gr.Textbox(
@@ -381,6 +389,12 @@ class TrainTab(BaseTab):
             ]
         )
 
+        self.components["auto_resume_checkbox"].change(
+            fn=lambda v: self.app.update_ui_state(auto_resume=v),
+            inputs=[self.components["auto_resume_checkbox"]],
+            outputs=[]
+        )
+
        # Add in the connect_events() method:
        self.components["num_gpus"].change(
            fn=lambda v: self.app.update_ui_state(num_gpus=v),