File size: 9,680 Bytes

82ea528

import torch
import pynvml
import comfy.model_management
from ..core import logger
# from ctypes import *
# from pyrsmi import rocml

class CGPUInfo:
    """
    This class is responsible for getting information from GPU (ONLY).
    """
    cuda = False
    pynvmlLoaded = False
    # pyamdLoaded = False
    # anygpuLoaded = False
    cudaAvailable = False
    torchDevice = 'cpu'
    cudaDevice = 'cpu'
    cudaDevicesFound = 0
    switchGPU = True
    switchVRAM = True
    switchTemperature = True
    gpus = []
    gpusUtilization = []
    gpusVRAM = []
    gpusTemperature = []

    def __init__(self):
        try:
            pynvml.nvmlInit()
            self.pynvmlLoaded = True
            logger.info('Pynvml (Nvidia) initialized.')
        except Exception as e:
            logger.error('Could not init pynvml (Nvidia).' + str(e))

        # if not self.pynvmlLoaded:
        #     try:
        #         rocml.smi_initialize()
        #         self.pyamdLoaded = True
        #         logger.info('Pyrsmi (AMD) initialized.')
        #     except Exception as e:
        #         logger.error('Could not init pyrsmi (AMD).' + str(e))

        # self.anygpuLoaded = self.pynvmlLoaded or self.pyamdLoaded
        self.anygpuLoaded = self.pynvmlLoaded

        try:
            self.torchDevice = comfy.model_management.get_torch_device_name(comfy.model_management.get_torch_device())
        except Exception as e:
            logger.error('Could not pick default device.' + str(e))

        # ZLUDA Check, self.torchDevice has 'ZLUDA' in it.
        if 'zluda' in self.torchDevice or 'ZLUDA' in self.torchDevice or 'Zluda' in self.torchDevice:
            logger.warn('ZLUDA detected. GPU monitoring will be disabled.')
            self.anygpuLoaded = False
            # self.pyamdLoaded = False
            self.pynvmlLoaded = False

        if self.anygpuLoaded and self.deviceGetCount() > 0:
            self.cudaDevicesFound = self.deviceGetCount()

            logger.info(f"GPU/s:")

            # for simulate multiple GPUs (for testing) interchange these comments:
            # for deviceIndex in range(3):
            #     deviceHandle = pynvml.nvmlDeviceGetHandleByIndex(0)
            for deviceIndex in range(self.cudaDevicesFound):
                deviceHandle = self.deviceGetHandleByIndex(deviceIndex)

                gpuName = self.deviceGetName(deviceHandle, deviceIndex)

                logger.info(f"{deviceIndex}) {gpuName}")

                self.gpus.append({
                    'index': deviceIndex,
                    'name': gpuName,
                })

                # same index as gpus, with default values
                self.gpusUtilization.append(True)
                self.gpusVRAM.append(True)
                self.gpusTemperature.append(True)

            self.cuda = True
            logger.info(self.systemGetDriverVersion())
        else:
            logger.warn('No GPU with CUDA detected.')

        self.cudaDevice = 'cpu' if self.torchDevice == 'cpu' else 'cuda'
        self.cudaAvailable = torch.cuda.is_available()

        if self.cuda and self.cudaAvailable and self.torchDevice == 'cpu':
            logger.warn('CUDA is available, but torch is using CPU.')

    def getInfo(self):
        logger.debug('Getting GPUs info...')
        return self.gpus

    def getStatus(self):
        # logger.debug('CGPUInfo getStatus')
        gpuUtilization = -1
        gpuTemperature = -1
        vramUsed = -1
        vramTotal = -1
        vramPercent = -1

        gpuType = ''
        gpus = []

        if self.cudaDevice == 'cpu':
            gpuType = 'cpu'
            gpus.append({
                'gpu_utilization': -1,
                'gpu_temperature': -1,
                'vram_total': -1,
                'vram_used': -1,
                'vram_used_percent': -1,
            })
        else:
            gpuType = self.cudaDevice

            if self.anygpuLoaded and self.cuda and self.cudaAvailable:
                # for simulate multiple GPUs (for testing) interchange these comments:
                # for deviceIndex in range(3):
                #     deviceHandle = self.deviceGetHandleByIndex(0)
                for deviceIndex in range(self.cudaDevicesFound):
                    deviceHandle = self.deviceGetHandleByIndex(deviceIndex)

                    gpuUtilization = -1
                    vramPercent = -1
                    vramUsed = -1
                    vramTotal = -1
                    gpuTemperature = -1

                    # GPU Utilization
                    if self.switchGPU and self.gpusUtilization[deviceIndex]:
                        try:
                            gpuUtilization = self.deviceGetUtilizationRates(deviceHandle)
                        except Exception as e:
                            if str(e) == "Unknown Error":
                                logger.error('For some reason, pynvml is not working in a laptop with only battery, try to connect and turn on the monitor')
                            else:
                                logger.error('Could not get GPU utilization.' + str(e))

                            logger.error('Monitor of GPU is turning off (not on UI!)')
                            self.switchGPU = False

                    # VRAM
                    if self.switchVRAM and self.gpusVRAM[deviceIndex]:
                        # Torch or pynvml?, pynvml is more accurate with the system, torch is more accurate with comfyUI
                        memory = self.deviceGetMemoryInfo(deviceHandle)
                        vramUsed = memory['used']
                        vramTotal = memory['total']

                        # device = torch.device(gpuType)
                        # vramUsed = torch.cuda.memory_allocated(device)
                        # vramTotal = torch.cuda.get_device_properties(device).total_memory

                        # check if vramTotal is not zero or None
                        if vramTotal and vramTotal != 0:
                            vramPercent = vramUsed / vramTotal * 100

                    # Temperature
                    if self.switchTemperature and self.gpusTemperature[deviceIndex]:
                        try:
                            gpuTemperature = self.deviceGetTemperature(deviceHandle)
                        except Exception as e:
                            logger.error('Could not get GPU temperature. Turning off this feature. ' + str(e))
                            self.switchTemperature = False

                    gpus.append({
                        'gpu_utilization': gpuUtilization,
                        'gpu_temperature': gpuTemperature,
                        'vram_total': vramTotal,
                        'vram_used': vramUsed,
                        'vram_used_percent': vramPercent,
                    })

        return {
            'device_type': gpuType,
            'gpus': gpus,
        }

    def deviceGetCount(self):
        if self.pynvmlLoaded:
            return pynvml.nvmlDeviceGetCount()
        # elif self.pyamdLoaded:
        #     return rocml.smi_get_device_count()
        else:
            return 0

    def deviceGetHandleByIndex(self, index):
        if self.pynvmlLoaded:
            return pynvml.nvmlDeviceGetHandleByIndex(index)
        # elif self.pyamdLoaded:
        #     return index
        else:
            return 0

    def deviceGetName(self, deviceHandle, deviceIndex):
        if self.pynvmlLoaded:
            gpuName = 'Unknown GPU'

            try:
                gpuName = pynvml.nvmlDeviceGetName(deviceHandle)
                try:
                    gpuName = gpuName.decode('utf-8', errors='ignore')
                except AttributeError as e:
                    pass

            except UnicodeDecodeError as e:
                gpuName = 'Unknown GPU (decoding error)'
                print(f"UnicodeDecodeError: {e}")

            return gpuName
        # elif self.pyamdLoaded:
        #     return rocml.smi_get_device_name(deviceIndex)
        else:
            return ''

    def systemGetDriverVersion(self):
        if self.pynvmlLoaded:
            return f'NVIDIA Driver: {pynvml.nvmlSystemGetDriverVersion()}'
        # elif self.pyamdLoaded:
        #     ver_str = create_string_buffer(256)
        #     rocml.rocm_lib.rsmi_version_str_get(0, ver_str, 256)
        #     return f'AMD Driver: {ver_str.value.decode()}'
        else:
            return 'Driver unknown'

    def deviceGetUtilizationRates(self, deviceHandle):
        if self.pynvmlLoaded:
            return pynvml.nvmlDeviceGetUtilizationRates(deviceHandle).gpu
        # elif self.pyamdLoaded:
        #     return rocml.smi_get_device_utilization(deviceHandle)
        else:
            return 0

    def deviceGetMemoryInfo(self, deviceHandle):
        if self.pynvmlLoaded:
            mem = pynvml.nvmlDeviceGetMemoryInfo(deviceHandle)
            return {'total': mem.total, 'used': mem.used}
        # elif self.pyamdLoaded:
        #     mem_used = rocml.smi_get_device_memory_used(deviceHandle)
        #     mem_total = rocml.smi_get_device_memory_total(deviceHandle)
        #     return {'total': mem_total, 'used': mem_used}
        else:
            return {'total': 1, 'used': 1}

    def deviceGetTemperature(self, deviceHandle):
        if self.pynvmlLoaded:
            return pynvml.nvmlDeviceGetTemperature(deviceHandle, pynvml.NVML_TEMPERATURE_GPU)
        # elif self.pyamdLoaded:
        #     temp = c_int64(0)
        #     rocml.rocm_lib.rsmi_dev_temp_metric_get(deviceHandle, 1, 0, byref(temp))
        #     return temp.value / 1000
        else:
            return 0