|
|
|
|
|
|
|
import subprocess |
|
import time |
|
import traceback |
|
from xml.dom import minidom |
|
|
|
|
|
def collect_gpu_usage(node_id):
    """Run ``nvidia-smi -q -x`` and return the parsed GPU metrics.

    Args:
        node_id: Kept for interface compatibility; this function does not
            use it.

    Returns:
        The dict built by ``parse_nvidia_smi_result`` on success. If
        nvidia-smi cannot be executed, or its output cannot be parsed, the
        placeholder dict from ``gen_empty_gpu_metric`` is returned instead,
        so callers always receive the same set of top-level keys.
    """
    cmd = ['nvidia-smi', '-q', '-x']
    try:
        info = parse_nvidia_smi_result(subprocess.check_output(cmd))
    except Exception:
        # Best effort: a missing binary or non-zero exit status must not
        # take the caller down; report the error and fall back below.
        traceback.print_exc()
        info = None
    # parse_nvidia_smi_result signals a parse failure with an empty dict;
    # treat that the same way as a failed nvidia-smi invocation.
    return info or gen_empty_gpu_metric()
|
|
|
|
|
def _first_text(node, tag):
    """Return the text data of the first descendant of *node* named *tag*."""
    return node.getElementsByTagName(tag)[0].childNodes[0].data


def parse_nvidia_smi_result(smi):
    """Convert the XML emitted by ``nvidia-smi -q -x`` into a metrics dict.

    Args:
        smi: The XML document as ``str`` or ``bytes``.

    Returns:
        A dict with the keys ``"Timestamp"`` (local time from
        ``time.asctime``), ``"gpuCount"`` and ``"gpuInfos"``. Each entry of
        ``"gpuInfos"`` describes one ``<gpu>`` element with the keys
        ``index``, ``gpuUtil``, ``gpuMemUtil``, ``activeProcessNum``,
        ``gpuType``, ``gpuMemTotal``, ``gpuMemUsed`` and ``gpuMemFree``.
        Utilisation and memory values are returned as strings with the
        ``%`` / ``MiB`` unit removed. If the document cannot be parsed or a
        required element is missing, the traceback is printed and an empty
        dict is returned.
    """
    try:
        xmldoc = minidom.parseString(smi)
        gpus = xmldoc.getElementsByTagName('gpu')
        output = {
            "Timestamp": time.asctime(),
            "gpuCount": len(gpus),
            "gpuInfos": [],
        }
        for index, gpu in enumerate(gpus):
            utilization = gpu.getElementsByTagName('utilization')[0]
            memory = gpu.getElementsByTagName('fb_memory_usage')[0]
            # A GPU entry without a <processes> element is counted as having
            # no running processes instead of invalidating the whole report.
            processes = gpu.getElementsByTagName('processes')
            if processes:
                active = len(processes[0].getElementsByTagName('process_info'))
            else:
                active = 0
            output["gpuInfos"].append({
                'index': index,
                'gpuUtil': _first_text(utilization, 'gpu_util')
                .replace("%", "").strip(),
                'gpuMemUtil': _first_text(utilization, 'memory_util')
                .replace("%", "").strip(),
                'activeProcessNum': active,
                'gpuType': _first_text(gpu, 'product_name'),
                'gpuMemTotal': _first_text(memory, 'total')
                .replace("MiB", "").strip(),
                'gpuMemUsed': _first_text(memory, 'used')
                .replace("MiB", "").strip(),
                'gpuMemFree': _first_text(memory, 'free')
                .replace("MiB", "").strip(),
            })
    except Exception:
        # Best effort: any malformed or unexpected document yields {}.
        traceback.print_exc()
        output = {}
    return output
|
|
|
|
|
def gen_empty_gpu_metric():
    """Build a placeholder metrics dict that reports zero GPUs.

    Returns:
        A new dict with ``"Timestamp"`` set to the current local time as
        formatted by ``time.asctime``, ``"gpuCount"`` set to ``0`` and
        ``"gpuInfos"`` set to a fresh empty list.
    """
    return {
        "Timestamp": time.asctime(),
        "gpuCount": 0,
        "gpuInfos": [],
    }
|
|