ShallowCodeResearch / mcp_hub /performance_monitoring.py
HallD's picture
Upload 60 files
df2b222 verified
"""Performance monitoring and metrics collection for the MCP Hub."""
import time
import psutil
import threading
from datetime import datetime, timedelta
from typing import Dict, Any, Optional
from collections import defaultdict, deque
from dataclasses import dataclass
from contextlib import contextmanager
from .logging_config import logger
@dataclass
class MetricPoint:
    """Single metric measurement.

    One timestamped sample of a named metric. Instances are created by
    ``MetricsCollector.record_metric`` and stored in its per-metric history.
    """
    # Moment the sample was recorded (record_metric uses datetime.now()).
    timestamp: datetime
    # Name of the metric this sample belongs to.
    metric_name: str
    # Measured value.
    value: float
    # Labels attached to the sample; record_metric passes {} when none are given.
    tags: Dict[str, str]
class MetricsCollector:
    """Collects and stores application metrics.

    Samples are kept in memory, one bounded ``deque`` per metric name, so the
    oldest samples of a metric are dropped once ``max_points`` is reached.
    Creating a collector also starts a daemon thread that periodically records
    system and process resource usage.
    """

    def __init__(self, max_points: int = 10000):
        """
        Initialize metrics collector.

        Args:
            max_points: Maximum number of metric points to store per metric name
        """
        self.max_points = max_points
        self.metrics = defaultdict(lambda: deque(maxlen=max_points))
        self.lock = threading.Lock()
        self.counters = defaultdict(int)
        self.timers = {}

        # Start system metrics collection thread
        self.system_thread = threading.Thread(target=self._collect_system_metrics, daemon=True)
        self.system_thread.start()
        logger.info("Metrics collector initialized")

    def record_metric(self, name: str, value: float, tags: Optional[Dict[str, str]] = None):
        """Record a metric value.

        Args:
            name: Metric name; samples are grouped by this key.
            value: Measured value.
            tags: Optional labels stored with the sample.
        """
        point = MetricPoint(
            timestamp=datetime.now(),
            metric_name=name,
            value=value,
            # Copy so that later mutation of the caller's dict cannot alter
            # samples that are already stored.
            tags=dict(tags) if tags else {},
        )
        with self.lock:
            self.metrics[name].append(point)

    def increment_counter(self, name: str, amount: int = 1, tags: Optional[Dict[str, str]] = None):
        """Increment a counter metric.

        The new running total is also recorded as a sample of the metric
        ``"<name>_count"``.

        Args:
            name: Counter name.
            amount: Value added to the counter.
            tags: Optional labels stored with the recorded sample.
        """
        with self.lock:
            self.counters[name] += amount
            # Snapshot under the lock so the recorded sample is the value this
            # call produced, not one changed by a concurrent increment.
            current = self.counters[name]

        # record_metric acquires self.lock itself; the lock is not reentrant,
        # so it must be released before this call.
        self.record_metric(f"{name}_count", current, tags)

    @contextmanager
    def timer(self, name: str, tags: Optional[Dict[str, str]] = None):
        """Context manager for timing operations.

        Records the elapsed time of the ``with`` body as a sample of
        ``"<name>_duration_seconds"``, whether or not the body raises.

        Args:
            name: Base name of the duration metric.
            tags: Optional labels stored with the sample.
        """
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted (or go negative) when the system clock is adjusted.
        start_time = time.perf_counter()
        try:
            yield
        finally:
            duration = time.perf_counter() - start_time
            self.record_metric(f"{name}_duration_seconds", duration, tags)

    def get_metrics_summary(self,
                            metric_name: Optional[str] = None,
                            last_minutes: int = 5) -> Dict[str, Any]:
        """Get summary statistics for metrics.

        Args:
            metric_name: Metric to summarize; all metrics when omitted.
            last_minutes: Only samples recorded within this many minutes are
                considered.

        Returns:
            Mapping of metric name to a dict with the keys ``count``,
            ``average``, ``min``, ``max``, ``latest`` and ``last_updated``
            (ISO-8601 string). Metrics without recent samples are omitted.
        """
        cutoff_time = datetime.now() - timedelta(minutes=last_minutes)

        # Copy the samples while holding the lock: the background thread may
        # append concurrently, and a deque must not change while iterated.
        with self.lock:
            if metric_name:
                # .get() instead of indexing: indexing the defaultdict would
                # create an empty entry for every unknown name queried.
                snapshot = {metric_name: list(self.metrics.get(metric_name, ()))}
            else:
                snapshot = {name: list(points) for name, points in self.metrics.items()}

        summary = {}
        for name, points in snapshot.items():
            recent_points = [p for p in points if p.timestamp >= cutoff_time]
            if not recent_points:
                continue

            values = [p.value for p in recent_points]
            summary[name] = {
                "count": len(values),
                "average": sum(values) / len(values),
                "min": min(values),
                "max": max(values),
                "latest": values[-1],
                "last_updated": recent_points[-1].timestamp.isoformat(),
            }
        return summary

    def _collect_system_metrics(self):
        """Background thread to collect system metrics.

        Runs forever: records system CPU/memory and process RSS/CPU, then
        sleeps 30 seconds. After an error it logs and sleeps 60 seconds.
        """
        # Reused across iterations. Process.cpu_percent() measures usage since
        # the previous call on the SAME object; a fresh Process per cycle
        # would make every reading the meaningless initial 0.0.
        process = None
        while True:
            try:
                if process is None:
                    process = psutil.Process()
                    # Priming call: the first result is always 0.0; discard it.
                    process.cpu_percent()

                # CPU and memory metrics (blocks for the 1 second sample window)
                cpu_percent = psutil.cpu_percent(interval=1)
                memory = psutil.virtual_memory()
                self.record_metric("system_cpu_percent", cpu_percent)
                self.record_metric("system_memory_percent", memory.percent)
                self.record_metric("system_memory_available_mb", memory.available / 1024 / 1024)

                # Process-specific metrics
                process_memory = process.memory_info()
                self.record_metric("process_memory_rss_mb", process_memory.rss / 1024 / 1024)
                self.record_metric("process_cpu_percent", process.cpu_percent())

                time.sleep(30)  # Collect every 30 seconds
            except Exception as e:
                logger.error(f"Error collecting system metrics: {e}")
                time.sleep(60)  # Wait longer if there's an error
class PerformanceProfiler:
    """Profile performance of agent operations.

    Each profiled operation is reported to the metrics collector and also
    appended to ``operation_stats``, an in-memory per-operation history.
    """

    def __init__(self, metrics_collector: "MetricsCollector"):
        """
        Initialize the profiler.

        Args:
            metrics_collector: Collector that receives the duration and
                memory-delta samples of every profiled operation.
        """
        self.metrics = metrics_collector
        self.operation_stats = defaultdict(list)

    @contextmanager
    def profile_operation(self, operation_name: str, **tags):
        """Context manager to profile an operation.

        Measures elapsed time and the change in process RSS across the
        ``with`` body, records both as metrics tagged with the operation name
        and outcome, and appends an entry to ``operation_stats``. Exceptions
        raised by the body are logged and re-raised.

        Args:
            operation_name: Name under which the operation is tracked.
            **tags: Extra tags merged into the recorded metric tags.
        """
        process = psutil.Process()
        # perf_counter is monotonic, so clock adjustments cannot skew durations.
        start_time = time.perf_counter()
        start_memory = process.memory_info().rss

        # Default to failure so `success` is always bound in `finally`, even
        # when the body exits via a BaseException (KeyboardInterrupt,
        # GeneratorExit, ...) that the `except Exception` clause does not see.
        success = False
        try:
            yield
            success = True
        except Exception as e:
            logger.error(f"Operation {operation_name} failed: {e}")
            raise
        finally:
            duration = time.perf_counter() - start_time
            end_memory = process.memory_info().rss
            memory_delta = (end_memory - start_memory) / 1024 / 1024  # MB

            # Record metrics
            operation_tags = {"operation": operation_name, "success": str(success), **tags}
            self.metrics.record_metric("operation_duration_seconds", duration, operation_tags)
            self.metrics.record_metric("operation_memory_delta_mb", memory_delta, operation_tags)

            # Update operation stats
            self.operation_stats[operation_name].append({
                "duration": duration,
                "memory_delta": memory_delta,
                "success": success,
                "timestamp": datetime.now()
            })

    def get_operation_summary(self, operation_name: Optional[str] = None) -> Dict[str, Any]:
        """Get summary of operation performance.

        Args:
            operation_name: Operation to summarize; all operations when omitted.

        Returns:
            Mapping of operation name to a dict with the keys ``total_calls``,
            ``success_rate``, ``avg_duration_seconds``, ``avg_memory_delta_mb``,
            ``min_duration`` and ``max_duration``. Operations without recorded
            calls are omitted.
        """
        if operation_name:
            # .get() instead of indexing: indexing the defaultdict would add
            # an empty entry for every unknown name queried.
            operations_to_analyze = {operation_name: self.operation_stats.get(operation_name, [])}
        else:
            operations_to_analyze = dict(self.operation_stats)

        summary = {}
        for op_name, recorded in operations_to_analyze.items():
            # Work on a copy so entries appended concurrently cannot make the
            # figures below disagree with each other.
            stats = list(recorded)
            if not stats:
                continue

            durations = [s["duration"] for s in stats]
            memory_deltas = [s["memory_delta"] for s in stats]
            success_rate = sum(1 for s in stats if s["success"]) / len(stats)

            summary[op_name] = {
                "total_calls": len(stats),
                "success_rate": success_rate,
                "avg_duration_seconds": sum(durations) / len(durations),
                "avg_memory_delta_mb": sum(memory_deltas) / len(memory_deltas),
                "min_duration": min(durations),
                "max_duration": max(durations)
            }
        return summary
# Global instances
# Module-level singletons used by the decorators in this module. Constructing
# MetricsCollector starts its daemon system-metrics thread, so importing this
# module has that side effect.
metrics_collector = MetricsCollector()
performance_profiler = PerformanceProfiler(metrics_collector)
# Convenience decorators
def track_performance(operation_name: Optional[str] = None):
    """Decorator to automatically track function performance.

    Every call of the decorated function runs inside
    ``performance_profiler.profile_operation``; calls that return normally
    also increment the ``"<operation>_calls"`` counter. Coroutine functions
    are wrapped with an ``async`` wrapper so the awaited execution is measured
    rather than just the creation of the coroutine object.

    Args:
        operation_name: Name to track the function under. Defaults to
            ``"<module>.<function name>"`` of the decorated function.

    Returns:
        A decorator that wraps the function while preserving its metadata.
    """
    import functools
    import inspect

    def decorator(func):
        # Resolve the name per decorated function and keep it local: writing
        # it back to the enclosing variable would make a reused decorator
        # label every later function with the first function's name.
        if operation_name is None:
            name = f"{func.__module__}.{func.__name__}"
        else:
            name = operation_name

        if inspect.iscoroutinefunction(func):
            @functools.wraps(func)
            async def wrapper(*args, **kwargs):
                with performance_profiler.profile_operation(name):
                    result = await func(*args, **kwargs)
                    metrics_collector.increment_counter(f"{name}_calls")
                    return result
        else:
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                with performance_profiler.profile_operation(name):
                    result = func(*args, **kwargs)
                    metrics_collector.increment_counter(f"{name}_calls")
                    return result

        return wrapper
    return decorator
def track_api_call(service_name: str):
    """Decorator specifically for tracking API calls.

    Every call of the decorated function is profiled as the operation
    ``"api_call"`` tagged with ``service=<service_name>``. The counter
    ``"api_calls_success"`` or ``"api_calls_failed"`` is incremented depending
    on whether the call returned or raised; exceptions are re-raised.
    Coroutine functions are wrapped with an ``async`` wrapper so the awaited
    execution, not just coroutine creation, is measured and counted.

    Args:
        service_name: Name of the called service, stored as the ``service`` tag.

    Returns:
        A decorator that wraps the function while preserving its metadata.
    """
    import functools
    import inspect

    def decorator(func):
        if inspect.iscoroutinefunction(func):
            @functools.wraps(func)
            async def wrapper(*args, **kwargs):
                with performance_profiler.profile_operation("api_call", service=service_name):
                    try:
                        result = await func(*args, **kwargs)
                        metrics_collector.increment_counter("api_calls_success", tags={"service": service_name})
                        return result
                    except Exception:
                        metrics_collector.increment_counter("api_calls_failed", tags={"service": service_name})
                        raise
        else:
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                with performance_profiler.profile_operation("api_call", service=service_name):
                    try:
                        result = func(*args, **kwargs)
                        metrics_collector.increment_counter("api_calls_success", tags={"service": service_name})
                        return result
                    except Exception:
                        metrics_collector.increment_counter("api_calls_failed", tags={"service": service_name})
                        raise

        return wrapper
    return decorator