import logging
import time

import torch

# psutil is imported lazily inside SystemHealthAgent so this module still
# loads when psutil is not installed.

logger = logging.getLogger(__name__)


class EnsembleMonitorAgent:
    """Tracks per-model latency and confidence and raises latency alerts."""

    def __init__(self):
        self.performance_metrics = {
            "model_accuracy": {},
            "response_times": {},
            "confidence_distribution": {},
            "consensus_rate": 0.0,
        }
        self.alerts = []

    def monitor_prediction(self, model_id, prediction, confidence, response_time):
        """Monitor individual model performance"""
        if model_id not in self.performance_metrics["model_accuracy"]:
            # model_accuracy stays empty until ground-truth labels are
            # available; latency and confidence are tracked immediately.
            self.performance_metrics["model_accuracy"][model_id] = []
            self.performance_metrics["response_times"][model_id] = []
            self.performance_metrics["confidence_distribution"][model_id] = []

        self.performance_metrics["response_times"][model_id].append(response_time)
        self.performance_metrics["confidence_distribution"][model_id].append(confidence)

        # Check for performance issues
        self._check_performance_issues(model_id)

    def _check_performance_issues(self, model_id):
        """Check for any performance anomalies"""
        response_times = self.performance_metrics["response_times"][model_id]
        if len(response_times) > 10:
            avg_time = sum(response_times[-10:]) / 10
            if avg_time > 2.0:  # More than 2 seconds
                self.alerts.append(
                    f"High latency detected for {model_id}: {avg_time:.2f}s"
                )
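

# Example wiring for the monitor (illustrative only; `model`, `image`, and
# the "model_a" id are hypothetical names, not part of this module):
#
#     monitor = EnsembleMonitorAgent()
#     start = time.time()
#     prediction, confidence = model.predict(image)
#     monitor.monitor_prediction("model_a", prediction, confidence,
#                                time.time() - start)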


class WeightOptimizationAgent:
    """Tracks ensemble accuracy over time and triggers weight re-optimization
    when performance degrades."""

    def __init__(self, weight_manager):
        self.weight_manager = weight_manager
        self.prediction_history = []  # Stores (ensemble_prediction_label, assumed_actual_label)
        self.optimization_threshold = 0.05  # A 5% relative drop in accuracy triggers optimization
        self.min_history_for_optimization = 20  # Minimum samples before optimizing

    def analyze_performance(self, ensemble_prediction_label, actual_label=None):
        """Analyze ensemble performance and record for optimization"""
        # If actual_label is not provided, assume the ensemble is correct
        # whenever it is not UNCERTAIN (an optimistic self-labeling fallback).
        assumed_actual_label = actual_label
        if assumed_actual_label is None and ensemble_prediction_label != "UNCERTAIN":
            assumed_actual_label = ensemble_prediction_label

        self.prediction_history.append((ensemble_prediction_label, assumed_actual_label))

        if len(self.prediction_history) >= self.min_history_for_optimization and self._should_optimize():
            self._optimize_weights()

    def _calculate_accuracy(self, history_subset):
        """Calculates accuracy based on history where actual_label is known."""
        correct_predictions = 0
        total_known = 0
        for ensemble_pred, actual_label in history_subset:
            if actual_label is not None:
                total_known += 1
                if ensemble_pred == actual_label:
                    correct_predictions += 1
        return correct_predictions / total_known if total_known > 0 else 0.0

    def _should_optimize(self):
        """Determine if weights should be optimized based on recent performance change."""
        # Need two full batches of history for a comparison.
        if len(self.prediction_history) < self.min_history_for_optimization * 2:
            return False

        # Compare accuracy of the most recent batch with the batch before it.
        recent_batch = self.prediction_history[-self.min_history_for_optimization:]
        previous_batch = self.prediction_history[-self.min_history_for_optimization * 2:-self.min_history_for_optimization]

        recent_accuracy = self._calculate_accuracy(recent_batch)
        previous_accuracy = self._calculate_accuracy(previous_batch)

        # Trigger optimization on a significant *relative* drop in accuracy,
        # e.g. 0.90 -> 0.80 is a drop of (0.90 - 0.80) / 0.90 ≈ 11%, which
        # exceeds the 5% threshold.
        if previous_accuracy > 0 and (previous_accuracy - recent_accuracy) / previous_accuracy > self.optimization_threshold:
            logger.warning(
                f"Performance degradation detected (from {previous_accuracy:.2f} "
                f"to {recent_accuracy:.2f}). Triggering weight optimization."
            )
            return True
        return False

    def _optimize_weights(self):
        """Optimize model weights based on performance."""
        logger.info("Optimizing model weights based on recent performance.")
        # Placeholder for more sophisticated optimization logic. This is
        # where self.weight_manager.base_weights would be adjusted based on
        # which models contributed more to correct predictions or errors.
        # For now, it only logs a message.
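
        # One possible approach, sketched below as comments. It assumes the
        # weight manager exposes a `base_weights` dict keyed by model_id and
        # that per-model correctness counts are tracked somewhere; both are
        # assumptions, not part of this module:
        #
        #     for model_id, stats in per_model_stats.items():
        #         accuracy = stats["correct"] / max(stats["total"], 1)
        #         self.weight_manager.base_weights[model_id] = accuracy
        #     total = sum(self.weight_manager.base_weights.values()) or 1.0
        #     for model_id in self.weight_manager.base_weights:
        #         self.weight_manager.base_weights[model_id] /= total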


class SystemHealthAgent:
    """Monitors host memory and GPU memory usage for the ensemble."""

    def __init__(self):
        self.health_metrics = {
            "memory_usage": [],
            "gpu_utilization": [],
            "model_load_times": {},
            "error_rates": {},
        }

    def monitor_system_health(self):
        """Monitor overall system health"""
        self._check_memory_usage()
        self._check_gpu_utilization()
        # You might add _check_model_health() here later

    def _check_memory_usage(self):
        """Monitor memory usage"""
        try:
            import psutil  # Imported lazily so the module works without it
            memory = psutil.virtual_memory()
            self.health_metrics["memory_usage"].append(memory.percent)
            if memory.percent > 90:
                logger.warning(f"High memory usage detected: {memory.percent}%")
        except ImportError:
            logger.warning("psutil not installed. Cannot monitor memory usage.")

    def _check_gpu_utilization(self):
        """Monitor GPU memory utilization if available"""
        if torch.cuda.is_available():
            try:
                # Fraction of the device's total memory currently allocated
                # by this process. (Dividing by max_memory_allocated(), as
                # before, raises ZeroDivisionError before any allocation and
                # only measures usage relative to the session peak.)
                total_memory = torch.cuda.get_device_properties(0).total_memory
                gpu_util = torch.cuda.memory_allocated() / total_memory
                self.health_metrics["gpu_utilization"].append(gpu_util)
                if gpu_util > 0.9:
                    logger.warning(f"High GPU utilization detected: {gpu_util*100:.2f}%")
            except Exception as e:
                logger.warning(f"Error monitoring GPU utilization: {e}")
        else:
            logger.info("CUDA not available. Skipping GPU utilization monitoring.")
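

# Minimal smoke test (illustrative only): `_StubWeightManager` and the
# "REAL" label are stand-ins for whatever weight manager and label set the
# real ensemble uses.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    class _StubWeightManager:
        def __init__(self):
            self.base_weights = {"model_a": 0.5, "model_b": 0.5}

    monitor = EnsembleMonitorAgent()
    monitor.monitor_prediction("model_a", "REAL", 0.92, response_time=0.4)

    optimizer = WeightOptimizationAgent(_StubWeightManager())
    for _ in range(45):
        optimizer.analyze_performance("REAL", actual_label="REAL")

    health = SystemHealthAgent()
    health.monitor_system_health()
    print("alerts:", monitor.alerts)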