"""
System monitoring service for Video Model Studio.
Tracks system resources like CPU, memory, and other metrics.
"""
import os
import time
import logging
import platform
import threading
from datetime import datetime, timedelta
from collections import deque
from typing import Dict, List, Optional, Tuple, Any

# Force the use of the Agg backend which is thread-safe
import matplotlib
matplotlib.use('Agg')  # Must be before importing pyplot
import matplotlib.pyplot as plt
import numpy as np
import psutil

from vms.ui.monitoring.services.gpu import GPUMonitoringService
# Module-level logger named after this module; its level is pinned to INFO.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class MonitoringService:
    """Service for monitoring system resources and performance.

    Samples CPU and memory usage (plus CPU temperature on Linux when psutil
    exposes a known sensor), keeps a bounded rolling history of the samples
    and renders that history as matplotlib figures. GPU metrics are delegated
    to ``GPUMonitoringService``.
    """

    # Divisor used to convert byte counts to the "GB" values reported here.
    _BYTES_PER_GB = 1024 ** 3

    def __init__(self, history_minutes: int = 10, sample_interval: int = 5):
        """Initialize the monitoring service.

        Args:
            history_minutes: How many minutes of history to keep.
            sample_interval: How many seconds between samples.
        """
        self.history_minutes = history_minutes
        self.sample_interval = sample_interval
        # Keep at least one slot: a sample interval longer than the history
        # window would otherwise create zero-length deques that silently
        # discard every sample.
        self.max_samples = max(1, (history_minutes * 60) // sample_interval)

        # Guards the history containers, which are written by the sampler
        # thread and read by the plotting methods.
        self._lock = threading.Lock()
        # Stop signal of the currently running sampler thread, if any.
        self._stop_event = None

        # Rolling history of the metrics; all series share the same capacity.
        self.timestamps = deque(maxlen=self.max_samples)
        self.cpu_percent = deque(maxlen=self.max_samples)
        self.memory_percent = deque(maxlen=self.max_samples)
        self.memory_used = deque(maxlen=self.max_samples)
        self.memory_available = deque(maxlen=self.max_samples)

        # CPU temperature history (might not be available on all systems)
        self.cpu_temp = deque(maxlen=self.max_samples)

        # Per-core CPU history: core index -> deque of usage percentages
        self.cpu_cores_percent = {}

        # GPU monitoring is handled by a dedicated service
        self.gpu = GPUMonitoringService(history_minutes=history_minutes, sample_interval=sample_interval)

        # Track if the monitoring thread is running
        self.is_running = False
        self.thread = None

        # Seed the history with the current values so it is not empty until
        # the background thread produces its first sample.
        self.update_history(self.collect_metrics())

    @staticmethod
    def _read_cpu_temperature() -> Optional[float]:
        """Return the CPU temperature reported by psutil, or None if unavailable."""
        if platform.system() != 'Linux':
            # macOS would need an SMC reader and Windows would need WMI; both
            # require extra dependencies, so no temperature is reported there.
            return None
        try:
            for name, entries in psutil.sensors_temperatures().items():
                if name.startswith(('coretemp', 'k10temp', 'cpu_thermal')):
                    return entries[0].current
        except (AttributeError, KeyError, IndexError, NotImplementedError):
            # Sensors not available
            pass
        return None

    def collect_metrics(self) -> Dict[str, Any]:
        """Collect current system metrics.

        Each call blocks for roughly 0.2 s because the overall and the
        per-core CPU usage are each measured over a 0.1 s interval.

        Returns:
            Dictionary with the keys 'timestamp', 'cpu_percent',
            'memory_percent', 'memory_used' and 'memory_available' (both in
            GB, i.e. bytes / 1024**3), 'cpu_temp' (None when unavailable) and
            'per_cpu_percent' (one value per core).
        """
        timestamp = datetime.now()
        cpu_total = psutil.cpu_percent(interval=0.1)
        # Read memory once so the three derived values describe the same instant.
        vm = psutil.virtual_memory()
        return {
            'timestamp': timestamp,
            'cpu_percent': cpu_total,
            'memory_percent': vm.percent,
            'memory_used': vm.used / self._BYTES_PER_GB,
            'memory_available': vm.available / self._BYTES_PER_GB,
            'cpu_temp': self._read_cpu_temperature(),
            'per_cpu_percent': psutil.cpu_percent(interval=0.1, percpu=True),
        }

    def update_history(self, metrics: Dict[str, Any]) -> None:
        """Update metric history with new values.

        Args:
            metrics: New metrics to add to history, in the format returned
                by ``collect_metrics``.
        """
        with self._lock:
            self.timestamps.append(metrics['timestamp'])
            self.cpu_percent.append(metrics['cpu_percent'])
            self.memory_percent.append(metrics['memory_percent'])
            self.memory_used.append(metrics['memory_used'])
            self.memory_available.append(metrics['memory_available'])

            # Temperature is only recorded when a reading exists.
            if metrics['cpu_temp'] is not None:
                self.cpu_temp.append(metrics['cpu_temp'])

            # Per-core series are created lazily the first time a core is seen.
            for core_index, percent in enumerate(metrics['per_cpu_percent']):
                if core_index not in self.cpu_cores_percent:
                    self.cpu_cores_percent[core_index] = deque(maxlen=self.max_samples)
                self.cpu_cores_percent[core_index].append(percent)

    def _monitor_loop(self, stop_event) -> None:
        """Sample metrics until ``stop_event`` is set or ``is_running`` is cleared."""
        while self.is_running and not stop_event.is_set():
            try:
                self.update_history(self.collect_metrics())
            except Exception as e:
                # Keep sampling after a failed iteration; only log the error.
                logger.error(f"Error in monitoring thread: {str(e)}", exc_info=True)
            # Unlike time.sleep, this returns as soon as stop is requested.
            stop_event.wait(self.sample_interval)

    def start_monitoring(self) -> None:
        """Start background thread for collecting metrics."""
        if self.is_running:
            logger.warning("Monitoring thread already running")
            return

        self.is_running = True

        # Start GPU monitoring if available
        self.gpu.start_monitoring()

        # Each run gets its own stop event, so a thread from an earlier run
        # that has not finished yet can never resume after a restart.
        stop_event = threading.Event()
        self._stop_event = stop_event
        self.thread = threading.Thread(target=self._monitor_loop, args=(stop_event,), daemon=True)
        self.thread.start()
        logger.info("System monitoring thread started")

    def stop_monitoring(self) -> None:
        """Stop the monitoring thread."""
        if not self.is_running:
            return

        self.is_running = False
        if self._stop_event is not None:
            # Wake the sampler out of its wait so the join below can succeed.
            self._stop_event.set()

        # Stop GPU monitoring
        self.gpu.stop_monitoring()

        if self.thread:
            self.thread.join(timeout=1.0)
        logger.info("System monitoring thread stopped")

    def get_current_metrics(self) -> Dict[str, Any]:
        """Get current system metrics.

        Returns:
            Dictionary with current system metrics (a fresh sample; the
            history is not modified).
        """
        return self.collect_metrics()

    def get_system_info(self) -> Dict[str, Any]:
        """Get general system information.

        Returns:
            Dictionary with the keys 'cpu', 'memory', 'disk' (keyed by mount
            point) and 'system'. Sizes are in GB (bytes / 1024**3) and
            'uptime' is the time since boot in seconds.
        """
        cpu_info = {
            'cores_physical': psutil.cpu_count(logical=False),
            'cores_logical': psutil.cpu_count(logical=True),
            'current_frequency': None,
            'architecture': platform.machine(),
        }

        # CPU frequency is best effort; it stays None when it cannot be read.
        try:
            cpu_freq = psutil.cpu_freq()
            if cpu_freq:
                cpu_info['current_frequency'] = cpu_freq.current
        except Exception:
            logger.debug("CPU frequency not available", exc_info=True)

        # Single read so all memory figures are mutually consistent.
        vm = psutil.virtual_memory()
        memory_info = {
            'total': vm.total / self._BYTES_PER_GB,
            'available': vm.available / self._BYTES_PER_GB,
            'used': vm.used / self._BYTES_PER_GB,
            'percent': vm.percent,
        }

        disk_info = {}
        for part in psutil.disk_partitions(all=False):
            if os.name == 'nt' and ('cdrom' in part.opts or part.fstype == ''):
                # Skip CD-ROM drives on Windows
                continue
            try:
                usage = psutil.disk_usage(part.mountpoint)
            except OSError:
                # Unreadable mount point (permission denied, device not
                # ready, ...): leave it out instead of failing the report.
                continue
            disk_info[part.mountpoint] = {
                'total': usage.total / self._BYTES_PER_GB,
                'used': usage.used / self._BYTES_PER_GB,
                'free': usage.free / self._BYTES_PER_GB,
                'percent': usage.percent,
            }

        sys_info = {
            'system': platform.system(),
            'version': platform.version(),
            'platform': platform.platform(),
            'processor': platform.processor(),
            'hostname': platform.node(),
            'python_version': platform.python_version(),
            'uptime': time.time() - psutil.boot_time(),
        }

        return {
            'cpu': cpu_info,
            'memory': memory_info,
            'disk': disk_info,
            'system': sys_info,
        }

    def _snapshot(self, *series):
        """Return list copies of the given history series, taken under the lock."""
        with self._lock:
            return [list(values) for values in series]

    @staticmethod
    def _sparse_ticks(labels, max_ticks):
        """Return (positions, labels) for roughly ``max_ticks`` evenly spaced ticks."""
        step = len(labels) // max_ticks if len(labels) > max_ticks else 1
        positions = list(range(0, len(labels), step))
        return positions, [labels[pos] for pos in positions]

    def generate_cpu_plot(self) -> "plt.Figure":
        """Generate a plot of CPU usage over time.

        Returns:
            Matplotlib figure with CPU usage plot (and CPU temperature on a
            secondary axis when temperature readings exist).
        """
        plt.close('all')  # Close all existing figures

        fig, ax = plt.subplots(figsize=(10, 5))

        # Work on copies so the sampler thread cannot change the series
        # (or their lengths) while they are being plotted.
        timestamps, cpu_values, temps = self._snapshot(
            self.timestamps, self.cpu_percent, self.cpu_temp
        )

        if not timestamps:
            ax.set_title("No CPU data available yet")
            return fig

        x = [t.strftime('%H:%M:%S') for t in timestamps]
        if len(x) > 10:
            # Show fewer x-axis labels for readability
            positions, tick_labels = self._sparse_ticks(x, 10)
            ax.set_xticks(positions)
            ax.set_xticklabels(tick_labels)

        ax.plot(x, cpu_values, 'b-', label='CPU Usage %')

        ax2 = None
        if temps:
            # Plot temperature on a secondary y-axis if available.
            # NOTE(review): readings are paired with the earliest timestamps,
            # which is only exact if every sample had a temperature.
            ax2 = ax.twinx()
            ax2.plot(x[:len(temps)], temps, 'r-', label='CPU Temp °C')
            ax2.set_ylabel('Temperature (°C)', color='r')
            ax2.tick_params(axis='y', colors='r')

        ax.set_title('CPU Usage Over Time')
        ax.set_xlabel('Time')
        ax.set_ylabel('Usage %')
        ax.grid(True, alpha=0.3)
        ax.set_ylim(0, 100)

        # Merge the entries of both axes into a single legend
        if ax2 is not None:
            lines, labels = ax.get_legend_handles_labels()
            lines2, labels2 = ax2.get_legend_handles_labels()
            ax.legend(lines + lines2, labels + labels2, loc='upper left')
        else:
            ax.legend(loc='upper left')

        plt.tight_layout()
        return fig

    def generate_memory_plot(self) -> "plt.Figure":
        """Generate a plot of memory usage over time.

        Returns:
            Matplotlib figure with memory usage in percent and, on a
            secondary axis, used/available memory in GB.
        """
        plt.close('all')  # Close all existing figures

        fig, ax = plt.subplots(figsize=(10, 5))

        # Consistent copies of all series (see generate_cpu_plot).
        timestamps, mem_percent, mem_used, mem_available = self._snapshot(
            self.timestamps, self.memory_percent, self.memory_used, self.memory_available
        )

        if not timestamps:
            ax.set_title("No memory data available yet")
            return fig

        x = [t.strftime('%H:%M:%S') for t in timestamps]
        if len(x) > 10:
            # Show fewer x-axis labels for readability
            positions, tick_labels = self._sparse_ticks(x, 10)
            ax.set_xticks(positions)
            ax.set_xticklabels(tick_labels)

        ax.plot(x, mem_percent, 'g-', label='Memory Usage %')

        # Add secondary y-axis for absolute memory values
        ax2 = ax.twinx()
        ax2.plot(x, mem_used, 'm--', label='Used (GB)')
        ax2.plot(x, mem_available, 'c--', label='Available (GB)')
        ax2.set_ylabel('Memory (GB)')

        ax.set_title('Memory Usage Over Time')
        ax.set_xlabel('Time')
        ax.set_ylabel('Usage %')
        ax.grid(True, alpha=0.3)
        ax.set_ylim(0, 100)

        # Merge the entries of both axes into a single legend
        lines, labels = ax.get_legend_handles_labels()
        lines2, labels2 = ax2.get_legend_handles_labels()
        ax.legend(lines + lines2, labels + labels2, loc='upper left')

        plt.tight_layout()
        return fig

    def generate_per_core_plot(self) -> "plt.Figure":
        """Generate a plot of per-core CPU usage.

        At most 16 cores (a 4x4 grid) are drawn; further cores are omitted.

        Returns:
            Matplotlib figure with one subplot per core.
        """
        # Always release earlier figures, not only on the "no data" path;
        # otherwise every call would leave one more figure open in pyplot.
        plt.close('all')

        with self._lock:
            timestamps = list(self.timestamps)
            cores = {
                core_id: list(values)
                for core_id, values in self.cpu_cores_percent.items()
            }

        num_cores = len(cores)
        if num_cores == 0:
            # No data yet
            fig, ax = plt.subplots(figsize=(10, 5))
            ax.set_title("No per-core CPU data available yet")
            return fig

        # Determine grid layout based on number of cores
        if num_cores <= 4:
            rows, cols = 2, 2
        elif num_cores <= 6:
            rows, cols = 2, 3
        elif num_cores <= 9:
            rows, cols = 3, 3
        elif num_cores <= 12:
            rows, cols = 3, 4
        else:
            rows, cols = 4, 4

        fig, axes = plt.subplots(rows, cols, figsize=(12, 8), sharex=True, sharey=True)
        axes = axes.flatten()

        x = [t.strftime('%H:%M:%S') for t in timestamps]
        # Show fewer x-axis labels for readability
        positions, tick_labels = self._sparse_ticks(x, 5)

        for index, (core_id, percentages) in enumerate(cores.items()):
            if index >= len(axes):
                break

            ax = axes[index]
            ax.plot(x[:len(percentages)], percentages, 'b-')
            ax.set_title(f'Core {core_id}')
            ax.set_ylim(0, 100)
            ax.grid(True, alpha=0.3)

            # Add x-axis labels sparingly for readability
            if index >= len(axes) - cols:  # Only for bottom row
                ax.set_xticks(positions)
                ax.set_xticklabels(tick_labels, rotation=45)

        # Hide unused subplots
        for index in range(num_cores, len(axes)):
            axes[index].set_visible(False)

        plt.tight_layout()
        return fig