|
import torch |
|
import pynvml |
|
import comfy.model_management |
|
from ..core import logger |
|
|
|
|
|
|
|
class CGPUInfo:
    """
    This class is responsible for getting information from GPU (ONLY).

    On construction it tries to initialize pynvml, asks ComfyUI which torch
    device is in use, and enumerates the GPUs reported by pynvml.
    `getInfo()` returns the static list of detected GPUs and `getStatus()`
    returns live utilization / VRAM / temperature readings per GPU.

    The `deviceGet*` / `systemGet*` methods are thin wrappers around pynvml
    that return a neutral fallback value when pynvml is not loaded.
    """

    # Scalar defaults; instances overwrite them in __init__.
    cuda = False
    pynvmlLoaded = False
    anygpuLoaded = False
    cudaAvailable = False
    torchDevice = 'cpu'
    cudaDevice = 'cpu'
    cudaDevicesFound = 0

    # Global on/off switches for each metric (apply to every GPU).
    switchGPU = True
    switchVRAM = True
    switchTemperature = True

    # Class-level names are kept so attribute access on the class still
    # works, but __init__ rebinds a fresh list on every instance: appending
    # to these shared lists would duplicate entries across instances.
    gpus = []
    gpusUtilization = []  # per-GPU switch for utilization readings
    gpusVRAM = []  # per-GPU switch for VRAM readings
    gpusTemperature = []  # per-GPU switch for temperature readings

    def __init__(self):
        """Initialize pynvml, detect the torch device and enumerate GPUs."""
        # Per-instance state (never mutate the class-level lists).
        self.gpus = []
        self.gpusUtilization = []
        self.gpusVRAM = []
        self.gpusTemperature = []

        try:
            pynvml.nvmlInit()
            self.pynvmlLoaded = True
            logger.info('Pynvml (Nvidia) initialized.')
        except Exception as e:
            logger.error('Could not init pynvml (Nvidia).' + str(e))

        self.anygpuLoaded = self.pynvmlLoaded

        try:
            self.torchDevice = comfy.model_management.get_torch_device_name(
                comfy.model_management.get_torch_device()
            )
        except Exception as e:
            logger.error('Could not pick default device.' + str(e))

        # Case-insensitive match so any spelling of "ZLUDA" is detected.
        if 'zluda' in self.torchDevice.lower():
            logger.warn('ZLUDA detected. GPU monitoring will be disabled.')
            self.anygpuLoaded = False
            self.pynvmlLoaded = False

        # Query the device count only once.
        deviceCount = self.deviceGetCount() if self.anygpuLoaded else 0

        if deviceCount > 0:
            self.cudaDevicesFound = deviceCount

            logger.info("GPU/s:")

            for deviceIndex in range(self.cudaDevicesFound):
                deviceHandle = self.deviceGetHandleByIndex(deviceIndex)
                gpuName = self.deviceGetName(deviceHandle, deviceIndex)

                logger.info(f"{deviceIndex}) {gpuName}")

                self.gpus.append({
                    'index': deviceIndex,
                    'name': gpuName,
                })

                # Every metric starts enabled for every detected GPU.
                self.gpusUtilization.append(True)
                self.gpusVRAM.append(True)
                self.gpusTemperature.append(True)

            self.cuda = True
            logger.info(self.systemGetDriverVersion())
        else:
            logger.warn('No GPU with CUDA detected.')

        self.cudaDevice = 'cpu' if self.torchDevice == 'cpu' else 'cuda'
        self.cudaAvailable = torch.cuda.is_available()

        if self.cuda and self.cudaAvailable and self.torchDevice == 'cpu':
            logger.warn('CUDA is available, but torch is using CPU.')

    def getInfo(self):
        """Return the list of detected GPUs as `{'index', 'name'}` dicts."""
        logger.debug('Getting GPUs info...')
        return self.gpus

    def getStatus(self):
        """
        Return the current readings for every detected GPU.

        Returns:
            dict: `{'device_type': str, 'gpus': list[dict]}` where each GPU
            dict has the keys `gpu_utilization`, `gpu_temperature`,
            `vram_total`, `vram_used` and `vram_used_percent`. A value of -1
            means the metric is disabled or could not be read. When the
            device is the CPU a single all -1 placeholder entry is returned.
        """
        gpus = []

        if self.cudaDevice == 'cpu':
            gpus.append({
                'gpu_utilization': -1,
                'gpu_temperature': -1,
                'vram_total': -1,
                'vram_used': -1,
                'vram_used_percent': -1,
            })
            return {
                'device_type': 'cpu',
                'gpus': gpus,
            }

        if self.anygpuLoaded and self.cuda and self.cudaAvailable:
            for deviceIndex in range(self.cudaDevicesFound):
                deviceHandle = self.deviceGetHandleByIndex(deviceIndex)

                gpuUtilization = -1
                vramPercent = -1
                vramUsed = -1
                vramTotal = -1
                gpuTemperature = -1

                # GPU utilization
                if self.switchGPU and self.gpusUtilization[deviceIndex]:
                    try:
                        gpuUtilization = self.deviceGetUtilizationRates(deviceHandle)
                    except Exception as e:
                        if str(e) == "Unknown Error":
                            logger.error('For some reason, pynvml is not working in a laptop with only battery, try to connect and turn on the monitor')
                        else:
                            logger.error('Could not get GPU utilization.' + str(e))

                        logger.error('Monitor of GPU is turning off (not on UI!)')
                        # Stop polling so the error is not logged on every call.
                        self.switchGPU = False

                # VRAM
                if self.switchVRAM and self.gpusVRAM[deviceIndex]:
                    try:
                        memory = self.deviceGetMemoryInfo(deviceHandle)
                        vramUsed = memory['used']
                        vramTotal = memory['total']
                    except Exception as e:
                        # Same policy as the other metrics: log once and
                        # disable instead of failing the whole status call.
                        logger.error('Could not get GPU memory info. Turning off this feature. ' + str(e))
                        self.switchVRAM = False
                    else:
                        # Guard against a zero/None total before dividing.
                        if vramTotal:
                            vramPercent = vramUsed / vramTotal * 100

                # Temperature
                if self.switchTemperature and self.gpusTemperature[deviceIndex]:
                    try:
                        gpuTemperature = self.deviceGetTemperature(deviceHandle)
                    except Exception as e:
                        logger.error('Could not get GPU temperature. Turning off this feature. ' + str(e))
                        self.switchTemperature = False

                gpus.append({
                    'gpu_utilization': gpuUtilization,
                    'gpu_temperature': gpuTemperature,
                    'vram_total': vramTotal,
                    'vram_used': vramUsed,
                    'vram_used_percent': vramPercent,
                })

        return {
            'device_type': self.cudaDevice,
            'gpus': gpus,
        }

    def deviceGetCount(self):
        """Return the number of devices reported by pynvml, or 0 if not loaded."""
        if self.pynvmlLoaded:
            return pynvml.nvmlDeviceGetCount()
        return 0

    def deviceGetHandleByIndex(self, index):
        """Return the pynvml handle for device `index`, or 0 if not loaded."""
        if self.pynvmlLoaded:
            return pynvml.nvmlDeviceGetHandleByIndex(index)
        return 0

    def deviceGetName(self, deviceHandle, deviceIndex):
        """
        Return the device name as `str`, or '' if pynvml is not loaded.

        `deviceIndex` is accepted for interface compatibility and is unused.
        Both `bytes` and `str` results from pynvml are handled.
        """
        if not self.pynvmlLoaded:
            return ''

        try:
            gpuName = pynvml.nvmlDeviceGetName(deviceHandle)
            if isinstance(gpuName, bytes):
                gpuName = gpuName.decode('utf-8', errors='ignore')
        except UnicodeDecodeError as e:
            # Presumably raised by pynvml's own decoding of the raw name;
            # our decode above uses errors='ignore' and cannot raise it.
            gpuName = 'Unknown GPU (decoding error)'
            logger.error(f"UnicodeDecodeError: {e}")

        return gpuName

    def systemGetDriverVersion(self):
        """Return a printable driver version string."""
        if self.pynvmlLoaded:
            return f'NVIDIA Driver: {pynvml.nvmlSystemGetDriverVersion()}'
        return 'Driver unknown'

    def deviceGetUtilizationRates(self, deviceHandle):
        """Return the `.gpu` utilization rate from pynvml, or 0 if not loaded."""
        if self.pynvmlLoaded:
            return pynvml.nvmlDeviceGetUtilizationRates(deviceHandle).gpu
        return 0

    def deviceGetMemoryInfo(self, deviceHandle):
        """Return `{'total', 'used'}` from pynvml, or a 1/1 stub if not loaded."""
        if self.pynvmlLoaded:
            mem = pynvml.nvmlDeviceGetMemoryInfo(deviceHandle)
            return {'total': mem.total, 'used': mem.used}
        return {'total': 1, 'used': 1}

    def deviceGetTemperature(self, deviceHandle):
        """Return the GPU temperature sensor reading, or 0 if not loaded."""
        if self.pynvmlLoaded:
            return pynvml.nvmlDeviceGetTemperature(deviceHandle, pynvml.NVML_TEMPERATURE_GPU)
        return 0
|
|