File size: 981 Bytes
02c1ae0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# health_check.py
import psutil
from dataclasses import dataclass
from typing import Dict, Any

@dataclass
class HealthStatus:
    """Snapshot of service health as reported by ``HealthCheck``.

    Field order is part of the constructor signature and must stay stable.
    """

    # Overall verdict label, e.g. "healthy".
    status: str
    # Per-device GPU memory keyed by "gpu_<index>"; HealthCheck reports GiB.
    gpu_memory: Dict[str, float]
    # System-wide CPU utilisation as a percentage (psutil.cpu_percent()).
    cpu_usage: float
    # RAM utilisation as a percentage (psutil.virtual_memory().percent).
    ram_usage: float
    # Per-model state strings; HealthCheck leaves this empty for the model
    # manager to fill in.
    model_status: Dict[str, str]

class HealthCheck:
    """Namespace of static helpers that take point-in-time resource readings."""

    @staticmethod
    def check_gpu_memory() -> Dict[str, float]:
        """Return the GPU memory PyTorch has allocated on each CUDA device.

        Returns:
            A mapping ``"gpu_<index>" -> GiB`` built from
            ``torch.cuda.memory_allocated`` for every visible device. The
            mapping is empty when PyTorch is not installed or CUDA is not
            available, so callers can always treat the result as a dict.
        """
        # torch is imported lazily: this module has no top-level torch
        # import, and a health check should still work on hosts without it.
        try:
            import torch
        except ImportError:
            return {}

        if not torch.cuda.is_available():
            return {}

        bytes_per_gib = 1024**3
        return {
            f"gpu_{i}": torch.cuda.memory_allocated(i) / bytes_per_gib
            for i in range(torch.cuda.device_count())
        }

    @staticmethod
    def check_system_resources() -> HealthStatus:
        """Build a ``HealthStatus`` snapshot of GPU, CPU and RAM usage.

        Returns:
            A ``HealthStatus`` whose ``status`` is always ``"healthy"`` (no
            thresholds are evaluated here), with per-GPU memory from
            ``check_gpu_memory``, CPU and RAM utilisation percentages from
            psutil, and an empty ``model_status``.
        """
        return HealthStatus(
            status="healthy",
            gpu_memory=HealthCheck.check_gpu_memory(),
            # NOTE: non-blocking call; psutil documents that the first such
            # call in a process returns a meaningless 0.0.
            cpu_usage=psutil.cpu_percent(),
            ram_usage=psutil.virtual_memory().percent,
            # TODO: add more system resources like disk, network, etc.
            model_status={},  # To be filled by the model manager
        )