"""Benchmark an ONNX model with ONNX Runtime on CPU, NPU (Vitis AI EP), or iGPU (DirectML EP)."""

import os
import time
import argparse
import subprocess
import platform
from typing import Optional, Tuple, Dict
import threading

import numpy as np
from onnx import load, ModelProto
import onnxruntime as ort

# Disable the XLNX model cache and put the flexmlrt runtime DLLs on PATH
# (assumes the script runs inside a conda environment that provides flexmlrt).
os.environ["XLNX_ENABLE_CACHE"] = "0"
os.environ["PATH"] += (
    os.pathsep + f"{os.environ['CONDA_PREFIX']}\\Lib\\site-packages\\flexmlrt\\lib"
)

XRT_SMI_PATH = "C:\\Windows\\System32\\AMD\\xrt-smi.exe"

# Map ONNX Runtime tensor type strings to NumPy dtypes for random input generation.
ONNX_DTYPE_TO_NP = {
    "tensor(float)": np.float32,
    "tensor(float16)": np.float16,
    "tensor(int64)": np.int64,
    "tensor(int32)": np.int32,
    "tensor(uint16)": np.uint16,
    "tensor(int16)": np.int16,
    "tensor(uint8)": np.uint8,
    "tensor(int8)": np.int8,
}


def generate_rand_data_from_onnx(
    model: ModelProto,
    lowest_int_val: Optional[int],
    highest_int_val: Optional[int],
) -> Dict[str, np.ndarray]:
    """Generate a random input feed for every graph input of the given model."""
    sess = ort.InferenceSession(
        model.SerializePartialToString(), providers=["CPUExecutionProvider"]
    )

    inps = {}
    for inp in sess.get_inputs():
        inp_shapes = list(inp.shape)
        for inp_shape in inp_shapes:
            assert isinstance(
                inp_shape, int
            ), f"Found dynamic axes: {inp_shape}. Please freeze."
        np_type = ONNX_DTYPE_TO_NP[inp.type]
        if np.issubdtype(np_type, np.integer):
            # Fall back to the dtype's own range when no explicit bounds are given.
            iinfo = np.iinfo(np_type)
            low = iinfo.min if lowest_int_val is None else lowest_int_val
            high = iinfo.max if highest_int_val is None else highest_int_val
            inps[inp.name] = np.random.randint(
                low, high, size=tuple(inp_shapes), dtype=np_type
            )
        else:
            inps[inp.name] = np.random.rand(*inp_shapes).astype(np_type)

    return inps
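
# Example usage (a sketch, not part of the benchmark flow): "model.onnx" is a
# hypothetical frozen model with static input shapes.
#
#   from onnx import load
#   feed = generate_rand_data_from_onnx(load("model.onnx"), None, None)
#   for name, arr in feed.items():
#       print(name, arr.shape, arr.dtype)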


def configure_npu_power(p_mode: Optional[str] = None) -> Tuple[int, str, str]:
    """
    Configures the NPU power state using xrt-smi.exe.

    Args:
        p_mode (str, optional): The desired power mode (p-mode).
            If None, displays the current platform status instead.
            Refer to the xrt-smi documentation for valid p-modes.

    Returns:
        tuple: (return_code, stdout, stderr) from the subprocess call.
            return_code is an integer, stdout and stderr are strings.

    Raises:
        OSError: If xrt-smi.exe is not found.
    """
    if platform.system() != "Windows":
        return (-1, "xrt-smi.exe is only available on Windows.", "")

    try:
        if p_mode is not None:
            command = [XRT_SMI_PATH, "configure", "--pmode", str(p_mode)]
        else:
            command = [
                XRT_SMI_PATH,
                "examine",
                "--report",
                "platform",
            ]

        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
        )
        stdout, stderr = process.communicate()
        return_code = process.returncode

        if return_code != 0:
            print(f"Error executing xrt-smi.exe: {stderr}")

        return return_code, stdout, stderr

    except FileNotFoundError:
        raise OSError("xrt-smi.exe not found.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return -1, "", str(e)
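
# Example usage (a sketch): query the current platform report, or request one of
# the p-modes accepted by the CLI below.
#
#   rc, out, err = configure_npu_power()              # examine --report platform
#   rc, out, err = configure_npu_power("performance") # configure --pmode performance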


def main(
    model_file: str,
    vaip_config: str,
    cache_path: str,
    device: str,
    pmode: str,
    warmup_runs: int,
    inferences: int,
    lowest_int_value: Optional[int],
    highest_int_value: Optional[int],
    threads: int,
):
    assert os.path.exists(model_file)
    assert threads >= 1

    if device == "cpu":
        ort_session = ort.InferenceSession(
            model_file,
            providers=["CPUExecutionProvider"],
        )

    elif device == "npu":
        assert os.path.exists(vaip_config)
        assert os.path.exists(cache_path)
        cache_dir = os.path.dirname(os.path.abspath(cache_path))
        cache_key = os.path.basename(cache_path)
        print(f"Using cache directory {cache_dir} with key {cache_key}")

        ret_code, stdout, stderr = configure_npu_power(pmode)
        print(stdout)
        if ret_code != 0:
            print("Error configuring npu power mode.")
            print(stderr)

        sess_options = ort.SessionOptions()
        ort_session = ort.InferenceSession(
            model_file,
            providers=["VitisAIExecutionProvider"],
            sess_options=sess_options,
            provider_options=[
                {
                    "config_file": vaip_config,
                    "cacheDir": cache_dir,
                    "cacheKey": cache_key,
                }
            ],
        )

    elif device == "igpu":
        ort_session = ort.InferenceSession(
            model_file,
            providers=["DmlExecutionProvider"],
            # DirectML adapter index; adjust if the iGPU is enumerated differently.
            provider_options=[{"device_id": 2}],
        )

    onnx_inputs = generate_rand_data_from_onnx(
        load(model_file),
        lowest_int_val=lowest_int_value,
        highest_int_val=highest_int_value,
    )

    # Warm-up inferences are excluded from the timed measurement below.
    for _ in range(warmup_runs):
        ort_session.run(None, onnx_inputs)

    def run_inference(runs, session, inputs):
        for _ in range(runs):
            session.run(None, inputs)

    # A single InferenceSession is shared across threads (ORT supports concurrent
    # Run() calls); the inferences are split as evenly as possible per thread.
    latencies = []
    num_threads = threads
    threads_list = []
    inferences_per_thread = inferences // num_threads
    remainder = inferences % num_threads
    print(f"inferences per thread: {inferences_per_thread}, remainder: {remainder}")
    start = time.perf_counter()
    for i in range(num_threads):
        num_runs = inferences_per_thread + (1 if i < remainder else 0)
        thread = threading.Thread(
            target=run_inference, args=(num_runs, ort_session, onnx_inputs)
        )
        threads_list.append(thread)
        thread.start()

    for thread in threads_list:
        thread.join()

    end = time.perf_counter()
    # Wall-clock time divided by the total inference count; with threads > 1 this is
    # an aggregate figure rather than a per-request latency.
    latencies.append((end - start) / inferences)
    print(f"Latencies: {latencies}")
    print(f"Benchmark results averaged over {inferences} inferences targeting {device}")
    print("Average latency (ms): ", round(np.mean(latencies) * 1e3, 3))
    print("Average throughput (inf/s): ", round(1 / np.mean(latencies), 3))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Benchmark an ONNX model on CPU, NPU (Vitis AI EP), or iGPU (DirectML EP).",
    )

    parser.add_argument(
        "--pmode",
        type=str,
        choices=["default", "powersaver", "balanced", "performance", "turbo"],
        default="default",
        help="Desired power mode.",
    )

    parser.add_argument(
        "onnx_model",
        type=str,
        help="Provide the onnx model file.",
    )

    parser.add_argument(
        "--vai-config",
        type=str,
        help="Path to the vaip configuration json file. Required when --device is npu.",
    )

    parser.add_argument(
        "--cache-path",
        required=False,
        type=str,
        help="Path to the saved compilation directory. Required when --device is npu.",
    )

    parser.add_argument(
        "--device",
        required=False,
        type=str,
        default="npu",
        choices=["npu", "cpu", "igpu"],
        help="Select the device to run the measurements on.",
    )

    parser.add_argument(
        "--warmup-runs",
        required=False,
        default=10,
        type=int,
        help="The number of inferences to run before capturing performance.",
    )

    parser.add_argument(
        "--inferences",
        required=False,
        default=100,
        type=int,
        help="The number of inferences to average performance over.",
    )

    parser.add_argument(
        "--lowest-int-value",
        required=False,
        type=int,
        help="Lowest value the rng will produce if the model has an integer input type.",
    )

    parser.add_argument(
        "--highest-int-value",
        required=False,
        type=int,
        help="Highest value the rng will produce if the model has an integer input type.",
    )

    parser.add_argument(
        "--threads",
        default=1,
        required=False,
        type=int,
        help="The number of threads that are used to run the inferences.",
    )

    args = parser.parse_args()

    main(
        args.onnx_model,
        args.vai_config,
        args.cache_path,
        args.device,
        args.pmode,
        args.warmup_runs,
        args.inferences,
        args.lowest_int_value,
        args.highest_int_value,
        args.threads,
    )
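
# Example invocations (a sketch; "benchmark.py", the model, config and cache paths
# are placeholders for illustration):
#
#   python benchmark.py model.onnx --device cpu --inferences 200
#   python benchmark.py model.onnx --device npu --vai-config vaip_config.json ^
#       --cache-path .\cache\model --pmode performance --threads 4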