|
"""Performance monitoring and metrics collection for the MCP Hub."""
|
|
|
|
import time
|
|
import psutil
|
|
import threading
|
|
from datetime import datetime, timedelta
|
|
from typing import Dict, Any, Optional
|
|
from collections import defaultdict, deque
|
|
from dataclasses import dataclass
|
|
from contextlib import contextmanager
|
|
from .logging_config import logger
|
|
|
|
@dataclass
|
|
class MetricPoint:
|
|
"""Single metric measurement."""
|
|
timestamp: datetime
|
|
metric_name: str
|
|
value: float
|
|
tags: Dict[str, str]
|
|
|
|
class MetricsCollector:
|
|
"""Collects and stores application metrics."""
|
|
|
|
def __init__(self, max_points: int = 10000):
|
|
"""
|
|
Initialize metrics collector.
|
|
|
|
Args:
|
|
max_points: Maximum number of metric points to store
|
|
"""
|
|
self.max_points = max_points
|
|
self.metrics = defaultdict(lambda: deque(maxlen=max_points))
|
|
self.lock = threading.Lock()
|
|
self.counters = defaultdict(int)
|
|
self.timers = {}
|
|
|
|
|
|
self.system_thread = threading.Thread(target=self._collect_system_metrics, daemon=True)
|
|
self.system_thread.start()
|
|
logger.info("Metrics collector initialized")
|
|
|
|
def record_metric(self, name: str, value: float, tags: Optional[Dict[str, str]] = None):
|
|
"""Record a metric value."""
|
|
if tags is None:
|
|
tags = {}
|
|
|
|
point = MetricPoint(
|
|
timestamp=datetime.now(),
|
|
metric_name=name,
|
|
value=value,
|
|
tags=tags
|
|
)
|
|
|
|
with self.lock:
|
|
self.metrics[name].append(point)
|
|
|
|
def increment_counter(self, name: str, amount: int = 1, tags: Optional[Dict[str, str]] = None):
|
|
"""Increment a counter metric."""
|
|
with self.lock:
|
|
self.counters[name] += amount
|
|
|
|
self.record_metric(f"{name}_count", self.counters[name], tags)
|
|
|
|
@contextmanager
|
|
def timer(self, name: str, tags: Optional[Dict[str, str]] = None):
|
|
"""Context manager for timing operations."""
|
|
start_time = time.time()
|
|
try:
|
|
yield
|
|
finally:
|
|
duration = time.time() - start_time
|
|
self.record_metric(f"{name}_duration_seconds", duration, tags)
|
|
|
|
def get_metrics_summary(self,
|
|
metric_name: Optional[str] = None,
|
|
last_minutes: int = 5) -> Dict[str, Any]:
|
|
"""Get summary statistics for metrics."""
|
|
cutoff_time = datetime.now() - timedelta(minutes=last_minutes)
|
|
|
|
with self.lock:
|
|
if metric_name:
|
|
metrics_to_analyze = {metric_name: self.metrics[metric_name]}
|
|
else:
|
|
metrics_to_analyze = dict(self.metrics)
|
|
|
|
summary = {}
|
|
|
|
for name, points in metrics_to_analyze.items():
|
|
recent_points = [p for p in points if p.timestamp >= cutoff_time]
|
|
|
|
if not recent_points:
|
|
continue
|
|
|
|
values = [p.value for p in recent_points]
|
|
summary[name] = {
|
|
"count": len(values),
|
|
"average": sum(values) / len(values),
|
|
"min": min(values),
|
|
"max": max(values),
|
|
"latest": values[-1] if values else 0,
|
|
"last_updated": recent_points[-1].timestamp.isoformat() if recent_points else None
|
|
}
|
|
|
|
return summary
|
|
|
|
def _collect_system_metrics(self):
|
|
"""Background thread to collect system metrics."""
|
|
while True:
|
|
try:
|
|
|
|
cpu_percent = psutil.cpu_percent(interval=1)
|
|
memory = psutil.virtual_memory()
|
|
|
|
self.record_metric("system_cpu_percent", cpu_percent)
|
|
self.record_metric("system_memory_percent", memory.percent)
|
|
self.record_metric("system_memory_available_mb", memory.available / 1024 / 1024)
|
|
|
|
|
|
process = psutil.Process()
|
|
process_memory = process.memory_info()
|
|
|
|
self.record_metric("process_memory_rss_mb", process_memory.rss / 1024 / 1024)
|
|
self.record_metric("process_cpu_percent", process.cpu_percent())
|
|
|
|
time.sleep(30)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error collecting system metrics: {e}")
|
|
time.sleep(60)
|
|
|
|
class PerformanceProfiler:
|
|
"""Profile performance of agent operations."""
|
|
|
|
def __init__(self, metrics_collector: MetricsCollector):
|
|
self.metrics = metrics_collector
|
|
self.operation_stats = defaultdict(list)
|
|
|
|
@contextmanager
|
|
def profile_operation(self, operation_name: str, **tags):
|
|
"""Context manager to profile an operation."""
|
|
start_time = time.time()
|
|
start_memory = psutil.Process().memory_info().rss
|
|
|
|
try:
|
|
yield
|
|
success = True
|
|
except Exception as e:
|
|
success = False
|
|
logger.error(f"Operation {operation_name} failed: {e}")
|
|
raise
|
|
finally:
|
|
end_time = time.time()
|
|
end_memory = psutil.Process().memory_info().rss
|
|
|
|
duration = end_time - start_time
|
|
memory_delta = (end_memory - start_memory) / 1024 / 1024
|
|
|
|
|
|
operation_tags = {"operation": operation_name, "success": str(success), **tags}
|
|
self.metrics.record_metric("operation_duration_seconds", duration, operation_tags)
|
|
self.metrics.record_metric("operation_memory_delta_mb", memory_delta, operation_tags)
|
|
|
|
|
|
self.operation_stats[operation_name].append({
|
|
"duration": duration,
|
|
"memory_delta": memory_delta,
|
|
"success": success,
|
|
"timestamp": datetime.now()
|
|
})
|
|
|
|
def get_operation_summary(self, operation_name: str = None) -> Dict[str, Any]:
|
|
"""Get summary of operation performance."""
|
|
if operation_name:
|
|
operations_to_analyze = {operation_name: self.operation_stats[operation_name]}
|
|
else:
|
|
operations_to_analyze = dict(self.operation_stats)
|
|
|
|
summary = {}
|
|
|
|
for op_name, stats in operations_to_analyze.items():
|
|
if not stats:
|
|
continue
|
|
|
|
durations = [s["duration"] for s in stats]
|
|
memory_deltas = [s["memory_delta"] for s in stats]
|
|
success_rate = sum(1 for s in stats if s["success"]) / len(stats)
|
|
|
|
summary[op_name] = {
|
|
"total_calls": len(stats),
|
|
"success_rate": success_rate,
|
|
"avg_duration_seconds": sum(durations) / len(durations),
|
|
"avg_memory_delta_mb": sum(memory_deltas) / len(memory_deltas),
|
|
"min_duration": min(durations),
|
|
"max_duration": max(durations)
|
|
}
|
|
|
|
return summary
|
|
|
|
|
|
metrics_collector = MetricsCollector()
|
|
performance_profiler = PerformanceProfiler(metrics_collector)
|
|
|
|
|
|
def track_performance(operation_name: str = None):
|
|
"""Decorator to automatically track function performance."""
|
|
def decorator(func):
|
|
nonlocal operation_name
|
|
if operation_name is None:
|
|
operation_name = f"{func.__module__}.{func.__name__}"
|
|
|
|
def wrapper(*args, **kwargs):
|
|
with performance_profiler.profile_operation(operation_name):
|
|
result = func(*args, **kwargs)
|
|
metrics_collector.increment_counter(f"{operation_name}_calls")
|
|
return result
|
|
return wrapper
|
|
return decorator
|
|
|
|
def track_api_call(service_name: str):
|
|
"""Decorator specifically for tracking API calls."""
|
|
def decorator(func):
|
|
def wrapper(*args, **kwargs):
|
|
with performance_profiler.profile_operation("api_call", service=service_name):
|
|
try:
|
|
result = func(*args, **kwargs)
|
|
metrics_collector.increment_counter("api_calls_success", tags={"service": service_name})
|
|
return result
|
|
except Exception:
|
|
metrics_collector.increment_counter("api_calls_failed", tags={"service": service_name})
|
|
raise
|
|
return wrapper
|
|
return decorator
|
|
|