hot-fix: memory (#2)
Browse files
- hot-fix: memory (20a104276ba75f961781805af2e5c8a25f2a0f0f)
- agents/ensemble_team.py +16 -2
agents/ensemble_team.py
CHANGED
@@ -4,6 +4,7 @@ import torch
|
|
4 |
import psutil # Ensure psutil is imported here as well
|
5 |
import GPUtil
|
6 |
from datetime import datetime, timedelta
|
|
|
7 |
|
8 |
logger = logging.getLogger(__name__)
|
9 |
|
@@ -40,7 +41,7 @@ class EnsembleMonitorAgent:
|
|
40 |
self.alerts.append(alert_msg)
|
41 |
logger.warning(alert_msg)
|
42 |
|
43 |
-
logger.
|
44 |
|
45 |
def get_performance_summary(self):
|
46 |
logger.info("Generating performance summary for all models.")
|
@@ -74,7 +75,7 @@ class WeightOptimizationAgent:
|
|
74 |
|
75 |
# Keep history windowed
|
76 |
self.prediction_history = [p for p in self.prediction_history if timestamp - p["timestamp"] < self.performance_window]
|
77 |
-
logger.
|
78 |
|
79 |
# In a real scenario, this would involve a more complex optimization logic
|
80 |
# For now, it just logs the history length.
|
@@ -98,6 +99,19 @@ class SystemHealthAgent:
|
|
98 |
"percent": mem.percent
|
99 |
}
|
100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
gpu_info = []
|
102 |
try:
|
103 |
gpus = GPUtil.getGPUs()
|
|
|
4 |
import psutil # Ensure psutil is imported here as well
|
5 |
import GPUtil
|
6 |
from datetime import datetime, timedelta
|
7 |
+
import gc # Import garbage collector
|
8 |
|
9 |
logger = logging.getLogger(__name__)
|
10 |
|
|
|
41 |
self.alerts.append(alert_msg)
|
42 |
logger.warning(alert_msg)
|
43 |
|
44 |
+
logger.info(f"Updated metrics for '{model_id}': {metrics}")
|
45 |
|
46 |
def get_performance_summary(self):
|
47 |
logger.info("Generating performance summary for all models.")
|
|
|
75 |
|
76 |
# Keep history windowed
|
77 |
self.prediction_history = [p for p in self.prediction_history if timestamp - p["timestamp"] < self.performance_window]
|
78 |
+
logger.info(f"Prediction history length: {len(self.prediction_history)}")
|
79 |
|
80 |
# In a real scenario, this would involve a more complex optimization logic
|
81 |
# For now, it just logs the history length.
|
|
|
99 |
"percent": mem.percent
|
100 |
}
|
101 |
|
102 |
+
# Memory usage was observed stuck at 99% for hours: when it exceeds 90%, warn and trigger a garbage-collection pass.
|
103 |
+
if mem.percent > 90:
|
104 |
+
logger.warning(f"CRITICAL: System memory usage is at {mem.percent}%. Attempting to clear memory cache...")
|
105 |
+
gc.collect()
|
106 |
+
logger.info("Garbage collection triggered. Re-checking memory usage...")
|
107 |
+
mem_after_gc = psutil.virtual_memory()
|
108 |
+
self.health_metrics["memory_usage_after_gc"] = {
|
109 |
+
"total": mem_after_gc.total,
|
110 |
+
"available": mem_after_gc.available,
|
111 |
+
"percent": mem_after_gc.percent
|
112 |
+
}
|
113 |
+
logger.info(f"Memory usage after GC: {mem_after_gc.percent}%")
|
114 |
+
|
115 |
gpu_info = []
|
116 |
try:
|
117 |
gpus = GPUtil.getGPUs()
|