import gc
import logging
import time
from datetime import datetime, timedelta

import GPUtil
import psutil
import torch

logger = logging.getLogger(__name__)


class EnsembleMonitorAgent:
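    """Collect per-model prediction metrics (count, confidence, inference time)
    and record alerts for slow or low-confidence predictions."""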
    def __init__(self):
        logger.info("Initializing EnsembleMonitorAgent.")
        self.performance_metrics = {}
        self.alerts = []

    def monitor_prediction(self, model_id, prediction_label, confidence_score, inference_time):
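        """Record one prediction's confidence and latency, and raise alerts for
        slow (>5s) or low-confidence (<0.5) results."""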
        logger.info(f"Monitoring prediction for model '{model_id}'. Label: {prediction_label}, Confidence: {confidence_score:.2f}, Time: {inference_time:.4f}s")
        if model_id not in self.performance_metrics:
            self.performance_metrics[model_id] = {
                "total_predictions": 0,
                "correct_predictions": 0,  # reserved; not updated anywhere in this module
                "total_confidence": 0.0,
                "total_inference_time": 0.0
            }

        metrics = self.performance_metrics[model_id]
        metrics["total_predictions"] += 1
        metrics["total_confidence"] += confidence_score
        metrics["total_inference_time"] += inference_time

        if inference_time > 5.0:
            alert_msg = f"ALERT: Model '{model_id}' inference time exceeded 5.0s: {inference_time:.4f}s"
            self.alerts.append(alert_msg)
            logger.warning(alert_msg)

        if confidence_score < 0.5:
            alert_msg = f"ALERT: Model '{model_id}' returned low confidence: {confidence_score:.2f}"
            self.alerts.append(alert_msg)
            logger.warning(alert_msg)

        logger.info(f"Updated metrics for '{model_id}': {metrics}")

    def get_performance_summary(self):
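        """Return average confidence, average inference time, and total prediction count per model."""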
        logger.info("Generating performance summary for all models.")
        summary = {}
        for model_id, metrics in self.performance_metrics.items():
            avg_confidence = metrics["total_confidence"] / metrics["total_predictions"] if metrics["total_predictions"] > 0 else 0
            avg_inference_time = metrics["total_inference_time"] / metrics["total_predictions"] if metrics["total_predictions"] > 0 else 0
            summary[model_id] = {
                "avg_confidence": avg_confidence,
                "avg_inference_time": avg_inference_time,
                "total_predictions": metrics["total_predictions"]
            }
        logger.info(f"Performance summary: {summary}")
        return summary


class WeightOptimizationAgent:
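    """Record ensemble predictions (with optional ground truth) over a rolling 24-hour window."""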
    def __init__(self, weight_manager):
        logger.info("Initializing WeightOptimizationAgent.")
        self.weight_manager = weight_manager
        self.prediction_history = []
        self.performance_window = timedelta(hours=24)

    def analyze_performance(self, final_prediction, ground_truth=None):
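        """Append the latest ensemble prediction to the history and discard entries
        older than the performance window."""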
        logger.info(f"Analyzing performance. Final prediction: {final_prediction}, Ground truth: {ground_truth}")
        timestamp = datetime.now()
        self.prediction_history.append({
            "timestamp": timestamp,
            "final_prediction": final_prediction,
            "ground_truth": ground_truth
        })

        # Prune entries that have fallen outside the rolling performance window.
        self.prediction_history = [p for p in self.prediction_history if timestamp - p["timestamp"] < self.performance_window]
        logger.info(f"Prediction history length: {len(self.prediction_history)}")


class SystemHealthAgent:
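    """Sample CPU, memory, and GPU utilization, and trigger garbage collection
    when memory usage is critical."""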
    def __init__(self):
        logger.info("Initializing SystemHealthAgent.")
        self.health_metrics = {
            "cpu_percent": 0,
            "memory_usage": {"total": 0, "available": 0, "percent": 0},
            "gpu_utilization": []
        }

    def monitor_system_health(self):
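        """Refresh health_metrics with current CPU, memory, and GPU readings."""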
        logger.info("Monitoring system health...")
        self.health_metrics["cpu_percent"] = psutil.cpu_percent(interval=1)
        mem = psutil.virtual_memory()
        self.health_metrics["memory_usage"] = {
            "total": mem.total,
            "available": mem.available,
            "percent": mem.percent
        }

        # On critically high memory usage, force a garbage-collection pass and re-sample.
        if mem.percent > 90:
            logger.warning(f"CRITICAL: System memory usage is at {mem.percent}%. Attempting to clear memory cache...")
            gc.collect()
            logger.info("Garbage collection triggered. Re-checking memory usage...")
            mem_after_gc = psutil.virtual_memory()
            self.health_metrics["memory_usage_after_gc"] = {
                "total": mem_after_gc.total,
                "available": mem_after_gc.available,
                "percent": mem_after_gc.percent
            }
            logger.info(f"Memory usage after GC: {mem_after_gc.percent}%")

        # GPU metrics are best-effort: any failure while querying GPUs is logged, not raised.
        gpu_info = []
        try:
            gpus = GPUtil.getGPUs()
            for gpu in gpus:
                gpu_info.append({
                    "id": gpu.id,
                    "name": gpu.name,
                    "load": gpu.load,
                    "memoryUtil": gpu.memoryUtil,
                    "memoryTotal": gpu.memoryTotal,
                    "memoryUsed": gpu.memoryUsed
                })
        except Exception as e:
            logger.warning(f"Could not retrieve GPU information: {e}")
            gpu_info.append({"error": str(e)})
        self.health_metrics["gpu_utilization"] = gpu_info
        logger.info(f"System health metrics: CPU: {self.health_metrics['cpu_percent']}%, Memory: {self.health_metrics['memory_usage']['percent']}%, GPU: {gpu_info}")