LPX55 committed on
Commit
b26b103
·
verified ·
1 Parent(s): 737fdca

hot-fix: memory (#2)

Browse files

- hot-fix: memory (20a104276ba75f961781805af2e5c8a25f2a0f0f)

Files changed (1) hide show
  1. agents/ensemble_team.py +16 -2
agents/ensemble_team.py CHANGED
@@ -4,6 +4,7 @@ import torch
4
  import psutil # Ensure psutil is imported here as well
5
  import GPUtil
6
  from datetime import datetime, timedelta
 
7
 
8
  logger = logging.getLogger(__name__)
9
 
@@ -40,7 +41,7 @@ class EnsembleMonitorAgent:
40
  self.alerts.append(alert_msg)
41
  logger.warning(alert_msg)
42
 
43
- logger.debug(f"Updated metrics for '{model_id}': {metrics}")
44
 
45
  def get_performance_summary(self):
46
  logger.info("Generating performance summary for all models.")
@@ -74,7 +75,7 @@ class WeightOptimizationAgent:
74
 
75
  # Keep history windowed
76
  self.prediction_history = [p for p in self.prediction_history if timestamp - p["timestamp"] < self.performance_window]
77
- logger.debug(f"Prediction history length: {len(self.prediction_history)}")
78
 
79
  # In a real scenario, this would involve a more complex optimization logic
80
  # For now, it just logs the history length.
@@ -98,6 +99,19 @@ class SystemHealthAgent:
98
  "percent": mem.percent
99
  }
100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  gpu_info = []
102
  try:
103
  gpus = GPUtil.getGPUs()
 
4
  import psutil # Ensure psutil is imported here as well
5
  import GPUtil
6
  from datetime import datetime, timedelta
7
+ import gc # Import garbage collector
8
 
9
  logger = logging.getLogger(__name__)
10
 
 
41
  self.alerts.append(alert_msg)
42
  logger.warning(alert_msg)
43
 
44
+ logger.info(f"Updated metrics for '{model_id}': {metrics}")
45
 
46
  def get_performance_summary(self):
47
  logger.info("Generating performance summary for all models.")
 
75
 
76
  # Keep history windowed
77
  self.prediction_history = [p for p in self.prediction_history if timestamp - p["timestamp"] < self.performance_window]
78
+ logger.info(f"Prediction history length: {len(self.prediction_history)}")
79
 
80
  # In a real scenario, this would involve a more complex optimization logic
81
  # For now, it just logs the history length.
 
99
  "percent": mem.percent
100
  }
101
 
102
+ # Holy moly, been at 99% for hours whoops
103
+ if mem.percent > 90:
104
+ logger.warning(f"CRITICAL: System memory usage is at {mem.percent}%. Attempting to clear memory cache...")
105
+ gc.collect()
106
+ logger.info("Garbage collection triggered. Re-checking memory usage...")
107
+ mem_after_gc = psutil.virtual_memory()
108
+ self.health_metrics["memory_usage_after_gc"] = {
109
+ "total": mem_after_gc.total,
110
+ "available": mem_after_gc.available,
111
+ "percent": mem_after_gc.percent
112
+ }
113
+ logger.info(f"Memory usage after GC: {mem_after_gc.percent}%")
114
+
115
  gpu_info = []
116
  try:
117
  gpus = GPUtil.getGPUs()