LINC-BIT's picture
Upload 1912 files
b84549f verified
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import subprocess
import time
import traceback
from xml.dom import minidom
def collect_gpu_usage(node_id):
cmd = 'nvidia-smi -q -x'.split()
info = None
try:
smi_output = subprocess.check_output(cmd)
info = parse_nvidia_smi_result(smi_output)
except Exception:
traceback.print_exc()
info = gen_empty_gpu_metric()
return info
def parse_nvidia_smi_result(smi):
try:
output = {}
xmldoc = minidom.parseString(smi)
gpuList = xmldoc.getElementsByTagName('gpu')
output["Timestamp"] = time.asctime(time.localtime())
output["gpuCount"] = len(gpuList)
output["gpuInfos"] = []
for gpuIndex, gpu in enumerate(gpuList):
gpuInfo = {}
gpuInfo['index'] = gpuIndex
gpuInfo['gpuUtil'] = gpu.getElementsByTagName('utilization')[0]\
.getElementsByTagName('gpu_util')[0]\
.childNodes[0].data.replace("%", "").strip()
gpuInfo['gpuMemUtil'] = gpu.getElementsByTagName('utilization')[0]\
.getElementsByTagName('memory_util')[0]\
.childNodes[0].data.replace("%", "").strip()
processes = gpu.getElementsByTagName('processes')
runningProNumber = len(processes[0].getElementsByTagName('process_info'))
gpuInfo['activeProcessNum'] = runningProNumber
gpuInfo['gpuType'] = gpu.getElementsByTagName('product_name')[0]\
.childNodes[0].data
memUsage = gpu.getElementsByTagName('fb_memory_usage')[0]
gpuInfo['gpuMemTotal'] = memUsage.getElementsByTagName('total')[0]\
.childNodes[0].data.replace("MiB", "").strip()
gpuInfo['gpuMemUsed'] = memUsage.getElementsByTagName('used')[0]\
.childNodes[0].data.replace("MiB", "").strip()
gpuInfo['gpuMemFree'] = memUsage.getElementsByTagName('free')[0]\
.childNodes[0].data.replace("MiB", "").strip()
output["gpuInfos"].append(gpuInfo)
except Exception:
traceback.print_exc()
output = {}
return output
def gen_empty_gpu_metric():
try:
output = {}
output["Timestamp"] = time.asctime(time.localtime())
output["gpuCount"] = 0
output["gpuInfos"] = []
except Exception:
traceback.print_exc()
output = {}
return output