import torch
import pynvml
import comfy.model_management
from ..core import logger
# from ctypes import *
# from pyrsmi import rocml


class CGPUInfo:
    """
    This class is responsible for getting information from GPU (ONLY).

    On construction it tries to initialise pynvml, asks ComfyUI for the torch
    device name, and records one entry per GPU reported by pynvml.
    `getStatus` then polls utilization, VRAM and temperature for those GPUs.
    """
    cuda = False
    pynvmlLoaded = False
    # pyamdLoaded = False
    # anygpuLoaded = False
    cudaAvailable = False
    torchDevice = 'cpu'
    cudaDevice = 'cpu'
    cudaDevicesFound = 0
    # Global on/off switches for each metric; turned off when a query fails.
    switchGPU = True
    switchVRAM = True
    switchTemperature = True
    # Class-level defaults kept for backward compatibility; __init__ rebinds
    # each of these to a fresh per-instance list before appending to it.
    gpus = []
    gpusUtilization = []
    gpusVRAM = []
    gpusTemperature = []

    def __init__(self):
        # Per-instance lists: appending to the class-level lists would share
        # (and duplicate) the entries across every CGPUInfo instance.
        self.gpus = []
        self.gpusUtilization = []
        self.gpusVRAM = []
        self.gpusTemperature = []

        try:
            pynvml.nvmlInit()
            self.pynvmlLoaded = True
            logger.info('Pynvml (Nvidia) initialized.')
        except Exception as e:
            logger.error('Could not init pynvml (Nvidia).' + str(e))

        # if not self.pynvmlLoaded:
        #     try:
        #         rocml.smi_initialize()
        #         self.pyamdLoaded = True
        #         logger.info('Pyrsmi (AMD) initialized.')
        #     except Exception as e:
        #         logger.error('Could not init pyrsmi (AMD).' + str(e))

        # self.anygpuLoaded = self.pynvmlLoaded or self.pyamdLoaded
        self.anygpuLoaded = self.pynvmlLoaded

        try:
            self.torchDevice = comfy.model_management.get_torch_device_name(
                comfy.model_management.get_torch_device()
            )
        except Exception as e:
            logger.error('Could not pick default device.' + str(e))

        # ZLUDA check: the device name contains 'ZLUDA' (matched in any casing).
        # NOTE(review): `warn` is a deprecated alias on stdlib loggers; it is
        # kept because the logger comes from ..core — confirm before renaming.
        if 'zluda' in self.torchDevice.lower():
            logger.warn('ZLUDA detected. GPU monitoring will be disabled.')
            self.anygpuLoaded = False
            # self.pyamdLoaded = False
            self.pynvmlLoaded = False

        # Query the device count only once, and only if a backend is loaded.
        deviceCount = self.deviceGetCount() if self.anygpuLoaded else 0

        if deviceCount > 0:
            self.cudaDevicesFound = deviceCount

            logger.info("GPU/s:")

            # for simulate multiple GPUs (for testing) interchange these comments:
            # for deviceIndex in range(3):
            #     deviceHandle = pynvml.nvmlDeviceGetHandleByIndex(0)
            for deviceIndex in range(self.cudaDevicesFound):
                deviceHandle = self.deviceGetHandleByIndex(deviceIndex)

                gpuName = self.deviceGetName(deviceHandle, deviceIndex)

                logger.info(f"{deviceIndex}) {gpuName}")

                self.gpus.append({
                    'index': deviceIndex,
                    'name': gpuName,
                })

                # same index as gpus, with default values
                self.gpusUtilization.append(True)
                self.gpusVRAM.append(True)
                self.gpusTemperature.append(True)

            self.cuda = True
            logger.info(self.systemGetDriverVersion())
        else:
            logger.warn('No GPU with CUDA detected.')

        # Any non-'cpu' torch device name is reported as 'cuda'.
        self.cudaDevice = 'cpu' if self.torchDevice == 'cpu' else 'cuda'
        self.cudaAvailable = torch.cuda.is_available()

        if self.cuda and self.cudaAvailable and self.torchDevice == 'cpu':
            logger.warn('CUDA is available, but torch is using CPU.')

    def getInfo(self):
        """Return the list of detected GPUs as dicts with 'index' and 'name'."""
        logger.debug('Getting GPUs info...')
        return self.gpus

    def getStatus(self):
        """
        Poll the current metrics of every detected GPU.

        Returns a dict with:
          'device_type': 'cpu' or the value of `self.cudaDevice`.
          'gpus': a list with one dict per GPU holding 'gpu_utilization',
                  'gpu_temperature', 'vram_total', 'vram_used' and
                  'vram_used_percent'. A metric is -1 when it is switched off
                  or could not be read. On 'cpu' a single all -1 entry is
                  returned; on a non-cpu device without a loaded backend the
                  list is empty.

        A metric whose query raises is logged and its global switch is turned
        off, so it is not queried again on later calls.
        """
        # logger.debug('CGPUInfo getStatus')
        gpus = []

        if self.cudaDevice == 'cpu':
            gpuType = 'cpu'
            gpus.append({
                'gpu_utilization': -1,
                'gpu_temperature': -1,
                'vram_total': -1,
                'vram_used': -1,
                'vram_used_percent': -1,
            })
        else:
            gpuType = self.cudaDevice

            if self.anygpuLoaded and self.cuda and self.cudaAvailable:
                # for simulate multiple GPUs (for testing) interchange these comments:
                # for deviceIndex in range(3):
                #     deviceHandle = self.deviceGetHandleByIndex(0)
                for deviceIndex in range(self.cudaDevicesFound):
                    deviceHandle = self.deviceGetHandleByIndex(deviceIndex)

                    gpuUtilization = -1
                    vramPercent = -1
                    vramUsed = -1
                    vramTotal = -1
                    gpuTemperature = -1

                    # GPU Utilization
                    if self.switchGPU and self.gpusUtilization[deviceIndex]:
                        try:
                            gpuUtilization = self.deviceGetUtilizationRates(deviceHandle)
                        except Exception as e:
                            if str(e) == "Unknown Error":
                                logger.error('For some reason, pynvml is not working in a laptop with only battery, try to connect and turn on the monitor')
                            else:
                                logger.error('Could not get GPU utilization.' + str(e))

                            logger.error('Monitor of GPU is turning off (not on UI!)')
                            self.switchGPU = False

                    # VRAM
                    if self.switchVRAM and self.gpusVRAM[deviceIndex]:
                        # Torch or pynvml?, pynvml is more accurate with the system,
                        # torch is more accurate with comfyUI
                        try:
                            memory = self.deviceGetMemoryInfo(deviceHandle)
                            vramUsed = memory['used']
                            vramTotal = memory['total']
                        except Exception as e:
                            # Same policy as the other metrics: log, disable,
                            # and keep reporting instead of failing the poll.
                            logger.error('Could not get GPU memory info. Turning off this feature. ' + str(e))
                            self.switchVRAM = False
                            vramUsed = -1
                            vramTotal = -1

                        # device = torch.device(gpuType)
                        # vramUsed = torch.cuda.memory_allocated(device)
                        # vramTotal = torch.cuda.get_device_properties(device).total_memory

                        # Skip the percentage when the total is zero or None,
                        # or when the query above failed.
                        if vramTotal and vramTotal > 0:
                            vramPercent = vramUsed / vramTotal * 100

                    # Temperature
                    if self.switchTemperature and self.gpusTemperature[deviceIndex]:
                        try:
                            gpuTemperature = self.deviceGetTemperature(deviceHandle)
                        except Exception as e:
                            logger.error('Could not get GPU temperature. Turning off this feature. ' + str(e))
                            self.switchTemperature = False

                    gpus.append({
                        'gpu_utilization': gpuUtilization,
                        'gpu_temperature': gpuTemperature,
                        'vram_total': vramTotal,
                        'vram_used': vramUsed,
                        'vram_used_percent': vramPercent,
                    })

        return {
            'device_type': gpuType,
            'gpus': gpus,
        }

    def deviceGetCount(self):
        """Return the number of devices reported by pynvml, or 0 if it is not loaded."""
        if self.pynvmlLoaded:
            return pynvml.nvmlDeviceGetCount()
        # elif self.pyamdLoaded:
        #     return rocml.smi_get_device_count()
        else:
            return 0

    def deviceGetHandleByIndex(self, index):
        """Return the pynvml handle for device `index`, or 0 if pynvml is not loaded."""
        if self.pynvmlLoaded:
            return pynvml.nvmlDeviceGetHandleByIndex(index)
        # elif self.pyamdLoaded:
        #     return index
        else:
            return 0

    def deviceGetName(self, deviceHandle, deviceIndex):
        """
        Return the name of the device behind `deviceHandle`.

        `deviceIndex` is unused by the pynvml path (the commented-out AMD path
        would use it). Returns '' when pynvml is not loaded, and
        'Unknown GPU (decoding error)' when the name cannot be decoded.
        """
        if self.pynvmlLoaded:
            gpuName = 'Unknown GPU'

            try:
                gpuName = pynvml.nvmlDeviceGetName(deviceHandle)
                # The name is decoded only when it comes back as bytes;
                # any other type is returned unchanged.
                if isinstance(gpuName, bytes):
                    gpuName = gpuName.decode('utf-8', errors='ignore')
            except UnicodeDecodeError as e:
                gpuName = 'Unknown GPU (decoding error)'
                logger.error(f"UnicodeDecodeError: {e}")

            return gpuName
        # elif self.pyamdLoaded:
        #     return rocml.smi_get_device_name(deviceIndex)
        else:
            return ''

    def systemGetDriverVersion(self):
        """Return a printable driver version string, or 'Driver unknown' without pynvml."""
        if self.pynvmlLoaded:
            return f'NVIDIA Driver: {pynvml.nvmlSystemGetDriverVersion()}'
        # elif self.pyamdLoaded:
        #     ver_str = create_string_buffer(256)
        #     rocml.rocm_lib.rsmi_version_str_get(0, ver_str, 256)
        #     return f'AMD Driver: {ver_str.value.decode()}'
        else:
            return 'Driver unknown'

    def deviceGetUtilizationRates(self, deviceHandle):
        """Return the `.gpu` utilization rate from pynvml, or 0 if pynvml is not loaded."""
        if self.pynvmlLoaded:
            return pynvml.nvmlDeviceGetUtilizationRates(deviceHandle).gpu
        # elif self.pyamdLoaded:
        #     return rocml.smi_get_device_utilization(deviceHandle)
        else:
            return 0

    def deviceGetMemoryInfo(self, deviceHandle):
        """
        Return the device memory as {'total': ..., 'used': ...}.

        Falls back to {'total': 1, 'used': 1} when pynvml is not loaded.
        """
        if self.pynvmlLoaded:
            mem = pynvml.nvmlDeviceGetMemoryInfo(deviceHandle)
            return {'total': mem.total, 'used': mem.used}
        # elif self.pyamdLoaded:
        #     mem_used = rocml.smi_get_device_memory_used(deviceHandle)
        #     mem_total = rocml.smi_get_device_memory_total(deviceHandle)
        #     return {'total': mem_total, 'used': mem_used}
        else:
            return {'total': 1, 'used': 1}

    def deviceGetTemperature(self, deviceHandle):
        """Return the NVML_TEMPERATURE_GPU sensor reading, or 0 if pynvml is not loaded."""
        if self.pynvmlLoaded:
            return pynvml.nvmlDeviceGetTemperature(deviceHandle, pynvml.NVML_TEMPERATURE_GPU)
        # elif self.pyamdLoaded:
        #     temp = c_int64(0)
        #     rocml.rocm_lib.rsmi_dev_temp_metric_get(deviceHandle, 1, 0, byref(temp))
        #     return temp.value / 1000
        else:
            return 0