"""Performance monitoring and metrics collection for the MCP Hub."""

import functools
import threading
import time
from collections import defaultdict, deque
from contextlib import contextmanager
from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import Any, Dict, Optional

import psutil

from .logging_config import logger


@dataclass
class MetricPoint:
    """Single metric measurement."""

    timestamp: datetime
    metric_name: str
    value: float
    tags: Dict[str, str]


class MetricsCollector:
    """Collects and stores application metrics."""

    def __init__(self, max_points: int = 10000):
        """
        Initialize the metrics collector.

        Args:
            max_points: Maximum number of metric points to keep per metric name.
        """
        self.max_points = max_points
        # One bounded deque per metric name; the oldest points are evicted
        # automatically once maxlen is reached.
        self.metrics = defaultdict(lambda: deque(maxlen=max_points))
        # A reentrant lock is required here: increment_counter() calls
        # record_metric() while already holding the lock, which would
        # deadlock a plain threading.Lock.
        self.lock = threading.RLock()
        self.counters = defaultdict(int)

        # Background daemon thread that samples system metrics periodically.
        self.system_thread = threading.Thread(
            target=self._collect_system_metrics, daemon=True
        )
        self.system_thread.start()
        logger.info("Metrics collector initialized")

    def record_metric(self, name: str, value: float, tags: Optional[Dict[str, str]] = None):
        """Record a single metric value."""
        point = MetricPoint(
            timestamp=datetime.now(),
            metric_name=name,
            value=value,
            tags=tags or {},
        )
        with self.lock:
            self.metrics[name].append(point)

    def increment_counter(self, name: str, amount: int = 1, tags: Optional[Dict[str, str]] = None):
        """Increment a counter and record its new value as a metric."""
        with self.lock:
            self.counters[name] += amount
            # Safe to call while holding the lock because it is reentrant.
            self.record_metric(f"{name}_count", self.counters[name], tags)

    @contextmanager
    def timer(self, name: str, tags: Optional[Dict[str, str]] = None):
        """Context manager for timing operations."""
        # perf_counter() is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments.
        start_time = time.perf_counter()
        try:
            yield
        finally:
            duration = time.perf_counter() - start_time
            self.record_metric(f"{name}_duration_seconds", duration, tags)

    def get_metrics_summary(
        self,
        metric_name: Optional[str] = None,
        last_minutes: int = 5,
    ) -> Dict[str, Any]:
        """Get summary statistics for metrics recorded in the last window."""
        cutoff_time = datetime.now() - timedelta(minutes=last_minutes)

        # Analyze under the lock so the background thread cannot mutate the
        # deques mid-iteration.
        with self.lock:
            if metric_name:
                # Use .get() so that looking up an unknown name does not
                # create an empty entry in the defaultdict.
                metrics_to_analyze = {metric_name: self.metrics.get(metric_name, deque())}
            else:
                metrics_to_analyze = dict(self.metrics)

            summary = {}
            for name, points in metrics_to_analyze.items():
                recent_points = [p for p in points if p.timestamp >= cutoff_time]
                if not recent_points:
                    continue
                values = [p.value for p in recent_points]
                summary[name] = {
                    "count": len(values),
                    "average": sum(values) / len(values),
                    "min": min(values),
                    "max": max(values),
                    "latest": values[-1],
                    "last_updated": recent_points[-1].timestamp.isoformat(),
                }
        return summary
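
    # Example return shape (values are illustrative only):
    #
    #     {"system_cpu_percent": {"count": 10, "average": 12.4, "min": 3.0,
    #       "max": 41.0, "latest": 9.5,
    #       "last_updated": "2024-01-01T12:00:00"}}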

    def _collect_system_metrics(self):
        """Background loop that samples system and process metrics."""
        # Create the Process handle once: psutil's per-process cpu_percent()
        # measures the interval since the previous call on the same object,
        # so a fresh handle each iteration would always report 0.0.
        process = psutil.Process()
        while True:
            try:
                # System-wide CPU and memory. cpu_percent(interval=1) blocks
                # this thread for one second while sampling.
                cpu_percent = psutil.cpu_percent(interval=1)
                memory = psutil.virtual_memory()
                self.record_metric("system_cpu_percent", cpu_percent)
                self.record_metric("system_memory_percent", memory.percent)
                self.record_metric("system_memory_available_mb", memory.available / 1024 / 1024)

                # Process-specific metrics.
                process_memory = process.memory_info()
                self.record_metric("process_memory_rss_mb", process_memory.rss / 1024 / 1024)
                self.record_metric("process_cpu_percent", process.cpu_percent())

                time.sleep(30)  # Collect every 30 seconds.
            except Exception as e:
                logger.error(f"Error collecting system metrics: {e}")
                time.sleep(60)  # Back off if sampling fails.


class PerformanceProfiler:
    """Profile performance of agent operations."""

    def __init__(self, metrics_collector: MetricsCollector):
        self.metrics = metrics_collector
        # Bounded per-operation history so a long-running process does not
        # grow memory without limit (the cap of 1000 entries is arbitrary).
        self.operation_stats = defaultdict(lambda: deque(maxlen=1000))

    @contextmanager
    def profile_operation(self, operation_name: str, **tags):
        """Context manager to profile an operation's duration and memory use."""
        start_time = time.perf_counter()
        start_memory = psutil.Process().memory_info().rss
        success = False
        try:
            yield
            success = True
        except Exception as e:
            logger.error(f"Operation {operation_name} failed: {e}")
            raise
        finally:
            duration = time.perf_counter() - start_time
            end_memory = psutil.Process().memory_info().rss
            memory_delta = (end_memory - start_memory) / 1024 / 1024  # MB

            # Record metrics, tagging the outcome so failures are queryable.
            operation_tags = {"operation": operation_name, "success": str(success), **tags}
            self.metrics.record_metric("operation_duration_seconds", duration, operation_tags)
            self.metrics.record_metric("operation_memory_delta_mb", memory_delta, operation_tags)

            # Update the per-operation history.
            self.operation_stats[operation_name].append({
                "duration": duration,
                "memory_delta": memory_delta,
                "success": success,
                "timestamp": datetime.now(),
            })
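
    # Usage sketch (the agent call below is hypothetical):
    #
    #     with performance_profiler.profile_operation("summarize", model="gpt-4"):
    #         agent.summarize(document)
    #
    # Duration and RSS delta are recorded even when the block raises, and the
    # exception is re-raised for the caller to handle.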

    def get_operation_summary(self, operation_name: Optional[str] = None) -> Dict[str, Any]:
        """Get a summary of operation performance."""
        if operation_name:
            # .get() avoids creating an empty entry for unknown names.
            operations_to_analyze = {operation_name: self.operation_stats.get(operation_name, deque())}
        else:
            operations_to_analyze = dict(self.operation_stats)

        summary = {}
        for op_name, stats in operations_to_analyze.items():
            if not stats:
                continue
            durations = [s["duration"] for s in stats]
            memory_deltas = [s["memory_delta"] for s in stats]
            success_rate = sum(1 for s in stats if s["success"]) / len(stats)
            summary[op_name] = {
                "total_calls": len(stats),
                "success_rate": success_rate,
                "avg_duration_seconds": sum(durations) / len(durations),
                "avg_memory_delta_mb": sum(memory_deltas) / len(memory_deltas),
                "min_duration": min(durations),
                "max_duration": max(durations),
            }
        return summary


# Global instances shared across the application.
metrics_collector = MetricsCollector()
performance_profiler = PerformanceProfiler(metrics_collector)


# Convenience decorators
def track_performance(operation_name: Optional[str] = None):
    """Decorator to automatically track function performance."""

    def decorator(func):
        # Bind the resolved name locally instead of rebinding the enclosing
        # variable with nonlocal.
        name = operation_name or f"{func.__module__}.{func.__name__}"

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            with performance_profiler.profile_operation(name):
                result = func(*args, **kwargs)
                metrics_collector.increment_counter(f"{name}_calls")
                return result

        return wrapper

    return decorator


def track_api_call(service_name: str):
    """Decorator specifically for tracking API calls."""

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            with performance_profiler.profile_operation("api_call", service=service_name):
                try:
                    result = func(*args, **kwargs)
                    metrics_collector.increment_counter(
                        "api_calls_success", tags={"service": service_name}
                    )
                    return result
                except Exception:
                    metrics_collector.increment_counter(
                        "api_calls_failed", tags={"service": service_name}
                    )
                    raise

        return wrapper

    return decorator
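

# End-to-end usage sketch. The module path and the decorated function are
# assumptions for illustration, not part of this file:
#
#     from mcp_hub.performance import track_api_call, metrics_collector
#
#     @track_api_call("openai")
#     def call_model(prompt: str) -> str:
#         ...
#
#     call_model("hello")
#     print(metrics_collector.get_metrics_summary(last_minutes=1))
#
# A successful call increments "api_calls_success" and records
# "operation_duration_seconds" tagged with service="openai".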