|
|
|
import dataclasses |
|
import importlib |
|
import logging |
|
import os |
|
|
|
from typing import ( |
|
Any, |
|
Callable, |
|
Dict, |
|
Final, |
|
List, |
|
Mapping, |
|
Optional, |
|
Sequence, |
|
Set, |
|
Tuple, |
|
Union, |
|
) |
|
from typing_extensions import TypeAlias |
|
|
|
import torch |
|
import torch._C |
|
import torch._ops |
|
import torch._prims.executor |
|
import torch.fx |
|
from torch._subclasses.fake_tensor import FakeTensor |
|
from torch.fx._compatibility import compatibility |
|
from torch.fx.passes.fake_tensor_prop import FakeTensorProp |
|
from torch.fx.passes.operator_support import OperatorSupport |
|
from torch.fx.passes.tools_common import CALLABLE_NODE_OPS |
|
from torch.utils import _pytree |
|
|
|
try: |
|
|
|
import onnx |
|
import onnxruntime |
|
from onnxruntime.capi import _pybind_state as ORTC |
|
    # onnxscript is not used directly in DORT, but it is required by the
    # underlying exporter, so check that it is importable.
importlib.import_module("onnxscript") |
|
|
|
import torch.onnx |
|
import torch.onnx._internal |
|
import torch.onnx._internal.diagnostics |
|
import torch.onnx._internal.exporter |
|
import torch.onnx._internal.fx.decomposition_table |
|
import torch.onnx._internal.fx.passes |
|
from torch.onnx._internal.fx import fx_onnx_interpreter |
|
from torch.onnx._internal.fx.type_utils import ( |
|
_TORCH_DTYPE_TO_NUMPY_DTYPE, |
|
_TORCH_DTYPE_TO_ONNX_TENSOR_ELEMENT_TYPE, |
|
from_python_type_to_onnx_tensor_element_type, |
|
) |
|
|
|
_SUPPORT_ONNXRT = True |
|
except ImportError: |
|
_SUPPORT_ONNXRT = False |
|
|
|
__all__ = [ |
|
"is_onnxrt_backend_supported", |
|
"torch_compile_backend", |
|
"OrtExecutionProvider", |
|
"OrtBackendOptions", |
|
"OrtBackend", |
|
] |
|
|
|
|
|
def is_onnxrt_backend_supported() -> bool: |
|
"""Returns ``True`` if ONNX Runtime dependencies are installed and usable |
|
to support TorchDynamo backend integration; ``False`` otherwise. |
|
|
|
Example:: |
|
|
|
# xdoctest: +REQUIRES(env:TORCH_DOCTEST_ONNX) |
|
>>> import torch |
|
>>> if torch.onnx.is_onnxrt_backend_supported(): |
|
... @torch.compile(backend="onnxrt") |
|
... def f(x): |
|
... return x * x |
|
... print(f(torch.randn(10))) |
|
... else: |
|
... print("pip install onnx onnxscript onnxruntime") |
|
... |
|
""" |
|
return _SUPPORT_ONNXRT |
|
|
|
|
|
_dumped_onnx_model: Dict[str, int] = {} |
|
|
|
|
|
def _dump_onnx_model( |
|
model_string: bytes, graph_module: Optional[torch.fx.GraphModule] = None |
|
) -> str: |
|
"""Stores the onnx model into a file. |
|
The name is "{ONNXRT_DUMP_PATH}{N}.onnx" |
|
where *N* is the number of files already stored with |
|
this prefix. |
|
If graph_module is not None, the graph is stored as a string with |
|
the same filename except the extension (.txt). |
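
    Example (hypothetical; the prefix and model bytes are placeholders)::

        >>> # xdoctest: +SKIP("writes files to disk")
        >>> os.environ["ONNXRT_DUMP_PATH"] = "/tmp/dort_model_"
        >>> _dump_onnx_model(model_proto.SerializeToString())
        '/tmp/dort_model_0.onnx'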
|
""" |
|
prefix = os.environ.get("ONNXRT_DUMP_PATH", None) |
|
if not prefix: |
|
return "" |
|
n = _dumped_onnx_model.get(prefix, -1) + 1 |
|
filename = f"{prefix}{n}.onnx" |
|
with open(filename, "wb") as f: |
|
f.write(model_string) |
|
_dumped_onnx_model[prefix] = n |
|
if graph_module is not None: |
|
filename_txt = f"{prefix}{n}.txt" |
|
with open(filename_txt, "w", encoding="utf-8") as f: |
|
f.write(str(graph_module.graph)) |
|
return filename |
|
|
|
|
|
def _infer_default_eps() -> Sequence[str]: |
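    # Assume CPU is always available as the fallback. A richer default could
    # inspect the host environment, but providers inferred from the inputs
    # already take precedence in OrtBackend._select_eps.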
return ["CPUExecutionProvider"] |
|
|
|
|
|
def _nvtx_range_push(name: str): |
|
"""If PyTorch is installed with CUDA support, this starts NVTX range. |
|
|
|
Check torch.cuda.nvtx.range_push's document for more details. |
|
""" |
|
if torch.cuda.is_available(): |
|
torch.cuda.nvtx.range_push(name) |
|
|
|
|
|
def _nvtx_range_pop(): |
|
"""If PyTorch is installed with CUDA support, this terminates NVTX range. |
|
|
|
Check torch.cuda.nvtx.range_pop's document for more details. |
|
""" |
|
if torch.cuda.is_available(): |
|
torch.cuda.nvtx.range_pop() |
|
|
|
|
|
def _get_ort_device_type(device_type: str): |
|
if device_type == "cuda": |
|
return ORTC.OrtDevice.cuda() |
|
if device_type == "cpu": |
|
return ORTC.OrtDevice.cpu() |
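
    # PyTorch's "maia" device is mapped to ORT's NPU device type.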
if device_type == "maia": |
|
return ORTC.OrtDevice.npu() |
|
raise ValueError("Unsupported device type: " + device_type) |
|
|
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
|
|
|
|
|
|
class OrtOperatorSupport(OperatorSupport): |
|
"""Operator support for ONNXRuntime backend. |
|
|
|
    It makes a two-level support decision: one level via support_dict and the
    other via extra_support_dict. The support_dict logic is implemented in
    OrtOperatorSupport.is_node_supported, while extra_support_dict is consumed
    by the base class's OperatorSupport.is_node_supported.
|
""" |
|
|
|
def __init__(self, support_dict: Set[Any], extra_support_dict: Dict[str, Any]): |
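        # support_dict lists the ONNX-exportable aten/prim overloads; it is
        # checked first by the is_node_supported override below.
        # extra_support_dict is handled by the base class OperatorSupport.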
super().__init__(extra_support_dict) |
|
self._onnx_support_dict = support_dict |
|
|
|
def is_node_supported( |
|
self, submodules: Mapping[str, torch.nn.Module], node: torch.fx.Node |
|
) -> bool: |
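        # Nodes that are not callable (e.g., placeholders and outputs) are not
        # executed by ORT, so reject them early.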
if node.op not in CALLABLE_NODE_OPS: |
|
return False |
|
|
|
if node.op == "call_function" and node.target in self._onnx_support_dict: |
|
logger.info( |
|
"support_dict supports node.target: %s (type: %s)", |
|
node.target, |
|
type(node.target), |
|
) |
|
return True |
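
        # Fall back to the base class, which consults extra_support_dict.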
if super().is_node_supported(submodules, node): |
|
logger.info( |
|
"extra_support_dict supports node.target: %s (type: %s)", |
|
node.target, |
|
type(node.target), |
|
) |
|
return True |
|
logger.warning( |
|
"support_dict and extra_support_dict don't support node.target: %s (type: %s)", |
|
node.target, |
|
type(node.target), |
|
) |
|
return False |
|
|
|
|
|
def _move_placeholder_to_front(graph_module: torch.fx.GraphModule) -> None: |
|
""" |
|
    In torch.fx.Graph, a placeholder is a special assignment node. If it is not
    executed at the beginning of the graph, it could overwrite values computed
    by upstream nodes.
|
""" |
|
|
|
graph = graph_module.graph |
|
placeholders = [] |
|
first_not_placeholder = None |
|
for node in graph.nodes: |
|
if node.op == "placeholder": |
|
placeholders.append(node) |
|
if first_not_placeholder is None and node.op != "placeholder": |
|
first_not_placeholder = node |
|
if first_not_placeholder is None: |
|
return |
|
for placeholder in placeholders: |
|
first_not_placeholder.prepend(placeholder) |
|
|
|
|
|
def _infer_ep_from_device(*args) -> Tuple[str, ...]: |
|
"""Return the first valid device (i.e., GPU or CPU) in argument list.""" |
|
eps = [] |
|
for arg in args: |
|
if hasattr(arg, "device"): |
|
device = arg.device |
|
if device.type == "cuda": |
|
eps.append("CUDAExecutionProvider") |
|
elif device.type == "cpu": |
|
eps.append("CPUExecutionProvider") |
|
return tuple(eps) |
|
|
|
|
|
def _extract_graph_module_inputs(graph_module: torch.fx.GraphModule) -> Tuple[Any, ...]: |
|
placeholders = [] |
|
for node in graph_module.graph.nodes: |
|
if node.op == "placeholder": |
|
if hasattr(node, "meta") and "val" in node.meta: |
|
assert isinstance(node.meta["val"], torch.Tensor) |
|
placeholders.append(node) |
|
return tuple(placeholders) |
|
|
|
|
|
def _extract_graph_module_outputs(graph_module: torch.fx.GraphModule) -> Any: |
|
"""Collect "val" fields from outputs metadata in this torch.fx.GraphModule.""" |
|
for node in graph_module.graph.nodes: |
|
if node.op == "output": |
return node.args[0] |
|
raise ValueError("No output node found in this torch.fx.GraphModule.") |
|
|
|
|
|
def _infer_ep_from_graph_module(graph_module: torch.fx.GraphModule) -> Tuple[str, ...]: |
|
"""Return the all valid devices (i.e., GPU or CPU) among outputs of this torch.fx.GraphModule.""" |
|
flattened_output_args, _ = _pytree.tree_flatten( |
|
_extract_graph_module_outputs(graph_module) |
|
) |
|
|
|
selected_output_args = [ |
|
output_arg.meta["val"] |
|
for output_arg in flattened_output_args |
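        # Only outputs carrying an example value ("val") in their FX metadata
        # contribute device information; others are skipped.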
if (hasattr(output_arg, "meta") and "val" in output_arg.meta) |
|
] |
|
return _infer_ep_from_device(*selected_output_args) |
|
|
|
|
|
def _sort_eps(eps: Tuple[str, ...]) -> Tuple[str, ...]: |
|
"""Sort execution providers in eps based on pre-set priority.""" |
|
|
|
def get_execution_provider_priority(ep: str) -> int: |
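        # Note: ``sorted(..., reverse=True)`` below places providers with
        # larger priority values first.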
|
if ep == "CPUExecutionProvider": |
|
|
|
return 2 |
|
if ep == "CUDAExecutionProvider": |
|
|
|
|
|
return 1 |
|
|
|
return 0 |
|
|
|
unique_eps = set(eps) |
|
return tuple(sorted(unique_eps, key=get_execution_provider_priority, reverse=True)) |
|
|
|
|
|
def _get_onnx_devices( |
|
values: Tuple[ |
|
Union[ |
|
torch.Tensor, torch.SymInt, int, torch.SymFloat, float, torch.SymBool, bool |
|
], |
|
..., |
|
] |
|
) -> Tuple["ORTC.OrtDevice", ...]: |
|
def _device_id_or_zero(device_id: int) -> int: |
|
return device_id or 0 |
|
|
|
def _map_tensor_or_sym_to_device( |
|
value: Union[ |
|
torch.Tensor, torch.SymInt, int, torch.SymFloat, float, torch.SymBool, bool |
|
], |
|
    ) -> "ORTC.OrtDevice":
|
if isinstance(value, torch.Tensor): |
|
return ORTC.OrtDevice( |
|
_get_ort_device_type(value.device.type), |
|
ORTC.OrtDevice.default_memory(), |
|
_device_id_or_zero(value.device.index), |
|
) |
|
elif isinstance( |
|
value, (torch.SymInt, int, torch.SymFloat, float, torch.SymBool, bool) |
|
): |
|
return ORTC.OrtDevice( |
|
_get_ort_device_type("cpu"), ORTC.OrtDevice.default_memory(), 0 |
|
) |
|
else: |
|
raise ValueError("Unsupported value type: " + str(type(value))) |
|
|
|
if len(values) > 0: |
|
ort_devices = tuple(_map_tensor_or_sym_to_device(value) for value in values) |
|
return ort_devices |
|
else: |
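        # No inputs: default to a single CPU device (the literal ``1`` below is
        # an arbitrary Python int, which maps to CPU above).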
|
return (_map_tensor_or_sym_to_device(1),) |
|
|
|
|
|
def _get_ortvalues_from_torch_tensors( |
|
tensors: Tuple[torch.Tensor, ...], devices: Tuple["ORTC.OrtDevice", ...] |
|
) -> "ORTC.OrtValueVector":
|
ortvalues = ORTC.OrtValueVector() |
|
ortvalues.reserve(len(tensors)) |
|
dtypes = [] |
|
shapes = [] |
|
data_ptrs = [] |
|
|
|
for tensor in tensors: |
|
dtypes.append(_TORCH_DTYPE_TO_NUMPY_DTYPE[tensor.dtype]) |
|
shapes.append(tensor.size()) |
|
data_ptrs.append(tensor.data_ptr()) |
|
ortvalues.push_back_batch(tensors, data_ptrs, dtypes, shapes, devices) |
|
return ortvalues |
|
|
|
|
|
def _to_real_tensor(tensor: FakeTensor) -> torch.Tensor: |
|
if tensor.is_sparse: |
|
raise ValueError("sparse tensor is not yet supported.") |
|
out = torch.empty(tensor.size(), dtype=tensor.dtype, device=tensor.device) |
|
return out |
|
|
|
|
|
def _adjust_scalar_from_fx_to_onnx( |
|
dynamo_value: Union[ |
|
torch.Tensor, |
|
int, |
|
float, |
|
bool, |
|
], |
|
value_info: "onnx.ValueInfoProto", |
|
) -> torch.Tensor: |
|
"""Helper function to wrap PyTorch variables as torch.Tensor""" |
|
if ( |
|
isinstance(dynamo_value, torch.Tensor) |
|
and len(value_info.type.tensor_type.shape.dim) == 0 |
|
and dynamo_value.shape == (1,) |
|
): |
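        # The ONNX input is a rank-0 (scalar) tensor while the FX value has
        # shape (1,); squeeze the extra dimension to match ONNX.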
return torch.squeeze(dynamo_value) |
|
elif isinstance(dynamo_value, int): |
|
return torch.tensor(dynamo_value, dtype=torch.int64) |
|
elif isinstance(dynamo_value, float): |
|
return torch.tensor(dynamo_value, dtype=torch.float32) |
|
elif isinstance(dynamo_value, bool): |
|
return torch.tensor(dynamo_value, dtype=torch.bool) |
|
else: |
|
assert isinstance(dynamo_value, torch.Tensor) |
|
return dynamo_value.contiguous() |
|
|
|
|
|
def _adjust_scalar_from_onnx_to_fx( |
|
tensor: torch.Tensor, |
|
prim_value: Union[ |
|
torch.Tensor, |
|
torch.SymInt, |
|
int, |
|
torch.SymFloat, |
|
float, |
|
torch.SymBool, |
|
bool, |
|
], |
|
) -> Union[torch.Tensor, int, float, bool]:
|
"""Helper function to wrap ORT-produced torch.Tensor as PyTorch variables""" |
|
assert isinstance(tensor, torch.Tensor), "ORT's output must be tensor." |
|
if isinstance( |
|
prim_value, |
|
(torch.SymInt, int, torch.SymFloat, float, torch.SymBool, bool), |
|
): |
|
|
|
return tensor.item() |
|
return tensor |
|
|
|
|
|
def _run_onnx_session_with_ortvaluevector( |
|
sess: "onnxruntime.InferenceSession", |
|
input_names: Tuple[str, ...], |
|
inputs: Tuple[torch.Tensor, ...], |
|
input_devices: Tuple["ORTC.OrtDevice", ...], |
|
output_names: Tuple[str, ...], |
|
outputs: Tuple[torch.Tensor, ...], |
|
output_devices: Tuple["ORTC.OrtDevice", ...], |
|
preallocate_output: bool, |
|
input_value_infos: Tuple["onnx.ValueInfoProto", ...], |
|
normalized_prim_outputs: Tuple[ |
|
Union[ |
|
torch.Tensor, torch.SymInt, int, torch.SymFloat, float, torch.SymBool, bool |
|
], |
|
..., |
|
], |
|
) -> Tuple[Union[torch.Tensor, int, float, bool], ...]: |
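    """Run an ONNX Runtime session using ``OrtValueVector`` I/O binding.

    This path binds torch.Tensor data pointers directly to ORT values via
    ``push_back_batch``, avoiding tensor-to-numpy copies on inputs and, when
    ``preallocate_output`` is set, on outputs as well.
    """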
|
_nvtx_range_push("contiguous") |
|
inputs = tuple( |
|
_adjust_scalar_from_fx_to_onnx(arg, value_info) |
|
for arg, value_info in zip(inputs, input_value_infos) |
|
) |
|
_nvtx_range_pop() |
|
|
|
_nvtx_range_push("push_back_batch") |
|
ort_inputs = _get_ortvalues_from_torch_tensors(inputs, input_devices) |
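
    # If preallocate_output is True, output buffers are allocated on the
    # PyTorch side and bound to ORT before the run; otherwise ORT allocates
    # the outputs and they are wrapped as torch.Tensors afterwards.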
if preallocate_output: |
|
pth_outputs = tuple( |
|
_to_real_tensor(t) if isinstance(t, FakeTensor) else t for t in outputs |
|
) |
|
ort_outputs = _get_ortvalues_from_torch_tensors(pth_outputs, output_devices) |
|
else: |
|
ort_outputs = ORTC.OrtValueVector() |
|
_nvtx_range_pop() |
|
|
|
_nvtx_range_push("run_with_ortvaluevector") |
|
run_options = onnxruntime.RunOptions() |
|
run_options.add_run_config_entry("disable_synchronize_execution_providers", "1") |
|
sess.run_with_ortvaluevector( |
|
run_options, input_names, ort_inputs, output_names, ort_outputs, output_devices |
|
) |
|
_nvtx_range_pop() |
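
    # Post-processing: convert ORT outputs back into the PyTorch types the FX
    # graph expects (tensors, and Python scalars for symbolic values).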
if preallocate_output: |
|
|
|
_nvtx_range_push("after run_with_ortvaluevector") |
|
|
|
|
|
pth_outputs = tuple( |
|
_adjust_scalar_from_onnx_to_fx(onnx_output, prim_output) |
|
for onnx_output, prim_output in zip(pth_outputs, normalized_prim_outputs) |
|
) |
|
_nvtx_range_pop() |
|
return pth_outputs |
|
else: |
|
|
|
_nvtx_range_push("after run_with_ortvaluevector") |
        # Convert the ORT-owned OrtValues into torch.Tensors.
pth_outputs = onnxruntime.training.ortmodule._utils._ortvalues_to_torch_tensor( |
|
ort_outputs |
|
) |
|
|
|
pth_outputs = tuple( |
|
_adjust_scalar_from_onnx_to_fx(onnx_output, prim_output) |
|
for onnx_output, prim_output in zip(pth_outputs, normalized_prim_outputs) |
|
) |
|
_nvtx_range_pop() |
|
return pth_outputs |
|
|
|
|
|
def _run_onnx_session_with_fetch( |
|
sess: "onnxruntime.InferenceSession", |
|
input_names: Tuple[str, ...], |
|
inputs: Tuple[torch.Tensor, ...], |
|
input_devices: Tuple["ORTC.OrtDevice", ...], |
|
output_names: Tuple[str, ...], |
|
outputs: Tuple[torch.Tensor, ...], |
|
output_devices: Tuple["ORTC.OrtDevice", ...], |
|
preallocate_output: bool, |
|
input_value_infos: Tuple["onnx.ValueInfoProto", ...], |
|
normalized_prim_outputs: Tuple[ |
|
Union[ |
|
torch.Tensor, torch.SymInt, int, torch.SymFloat, float, torch.SymBool, bool |
|
], |
|
..., |
|
], |
|
) -> Tuple[Union[torch.Tensor, int, float, bool], ...]: |
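    """Run an ONNX Runtime session through the standard ``sess.run`` API.

    This fallback path (used when ``OrtValueVector.push_back_batch`` is
    unavailable) copies inputs to numpy arrays and outputs back to torch, so
    it is slower than ``_run_onnx_session_with_ortvaluevector``.
    """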
|
inputs = tuple( |
|
_adjust_scalar_from_fx_to_onnx(arg, value_info) |
|
for arg, value_info in zip(inputs, input_value_infos) |
|
) |
|
feed = { |
|
name: onnxruntime.OrtValue.ortvalue_from_numpy(tensor.cpu().numpy()) |
|
for name, tensor in zip(input_names, inputs) |
|
} |
|
ort_outputs = sess.run(output_names, feed) |
|
pth_outputs = tuple( |
|
_adjust_scalar_from_onnx_to_fx( |
|
torch.from_numpy(value), |
|
prim_output, |
|
) |
|
for value, prim_output in zip(ort_outputs, normalized_prim_outputs) |
|
) |
|
return pth_outputs |
|
|
|
|
|
class OrtExecutionInfoPerSession: |
|
"""Information required to execute torch.fx.GraphModule using onnxruntime.InferenceSession""" |
|
|
|
def __init__( |
|
self, |
|
session: "onnxruntime.InferenceSession", |
|
input_names: Tuple[str, ...], |
|
input_value_infos: Tuple["onnx.ValueInfoProto", ...], |
|
output_names: Tuple[str, ...], |
|
output_value_infos: Tuple["onnx.ValueInfoProto", ...], |
|
input_devices: Tuple["ORTC.OrtDevice", ...], |
|
output_devices: Tuple["ORTC.OrtDevice", ...], |
|
example_outputs: Union[Tuple[torch.Tensor, ...], torch.Tensor], |
|
): |
|
|
|
        # Carrier of the ONNX model and its executor.
        self.session: onnxruntime.InferenceSession = session
        # input_names[i] is the name of the i-th positional input of the ONNX
        # model in self.session; input_value_infos[i] carries its type and
        # shape information.
        self.input_names: Tuple[str, ...] = input_names
        self.input_value_infos: Tuple[onnx.ValueInfoProto, ...] = input_value_infos
        # Similar to the two fields above, but for outputs.
        self.output_names: Tuple[str, ...] = output_names
        self.output_value_infos: Tuple[onnx.ValueInfoProto, ...] = output_value_infos
        # input_devices[i] is the expected device of the i-th positional input.
        self.input_devices: Tuple["ORTC.OrtDevice", ...] = input_devices
        self.output_devices: Tuple["ORTC.OrtDevice", ...] = output_devices
        # Outputs of running the original torch.fx.GraphModule on example
        # inputs (i.e., the args passed to OrtBackend._ort_accelerated_call).
        self.example_outputs: Union[
            Tuple[torch.Tensor, ...], torch.Tensor
        ] = example_outputs
|
|
|
def is_supported(self, *args): |
|
        # Compare args against the ONNX model's input schema; each argument
        # must match in count, dtype, and (symbolic) shape.
if len(args) != len(self.input_value_infos): |
|
return False |
|
for arg, value_info in zip(args, self.input_value_infos): |
|
if not isinstance(arg, (torch.Tensor, float, int)): |
|
return False |
|
            # Python scalars map to rank-0 ONNX tensors; check dtype and rank.
if isinstance(arg, (int, float, bool)): |
|
|
|
onnx_dtype = from_python_type_to_onnx_tensor_element_type(type(arg)) |
|
if onnx_dtype != value_info.type.tensor_type.elem_type: |
|
return False |
|
if len(value_info.type.tensor_type.shape.dim) != 0: |
|
return False |
|
continue |
|
            # Tensor arguments: check dtype and every dimension against the
            # ONNX schema (dim_param marks a dynamic dimension).
onnx_dtype = _TORCH_DTYPE_TO_ONNX_TENSOR_ELEMENT_TYPE[arg.dtype] |
|
if onnx_dtype != value_info.type.tensor_type.elem_type: |
|
return False |
|
for dim, onnx_dim in zip(arg.shape, value_info.type.tensor_type.shape.dim): |
|
if isinstance(dim, int) and ( |
|
onnx_dim.dim_value == dim or onnx_dim.dim_param |
|
): |
|
continue |
|
elif isinstance(dim, torch.SymInt) and onnx_dim.dim_param: |
|
continue |
|
else: |
|
return False |
|
return True |
|
|
|
|
|
@dataclasses.dataclass |
|
class OrtExecutionInfoForAllGraphModules: |
|
def __init__(self): |
|
        # All sessions (and related information) created by exporting the same
        # GraphModule with different inputs.
self.execution_info_per_graph_module: Dict[ |
|
torch.fx.GraphModule, List[OrtExecutionInfoPerSession] |
|
] = {} |
|
|
|
def search_reusable_session_execution_info( |
|
self, graph_module: torch.fx.GraphModule, *args |
|
): |
|
if graph_module not in self.execution_info_per_graph_module: |
|
return None |
|
        # Execution information for all sessions previously created from this
        # graph_module with different example inputs.
candidates = self.execution_info_per_graph_module[graph_module] |
|
|
|
for candidate in candidates: |
|
if candidate.is_supported(*args): |
|
|
|
return candidate |
|
|
|
return None |
|
|
|
def cache_session_execution_info( |
|
self, graph_module: torch.fx.GraphModule, info: OrtExecutionInfoPerSession |
|
): |
|
if graph_module not in self.execution_info_per_graph_module: |
|
self.execution_info_per_graph_module[graph_module] = [info] |
|
else: |
|
self.execution_info_per_graph_module[graph_module].append(info) |
|
|
|
|
|
OrtExecutionProvider: TypeAlias = Union[str, Tuple[str, Mapping[str, Any]]] |
|
"""Either the name of an ONNX Runtime execution provider as a string or |
|
a 2-tuple of the name and a dictionary of execution provider options. |
|
|
|
Examples:: |
|
|
|
>>> "CPUExecutionProvider" |
|
|
|
>>> ("CUDAExecutionProvider", {"device_id": 3}) |
|
|
|
""" |
|
|
|
|
|
@dataclasses.dataclass(frozen=True) |
|
@compatibility(is_backward_compatible=False) |
|
class OrtBackendOptions: |
|
"""Options for constructing an ``OrtBackend``, the ONNX Runtime |
|
backend (``"onnxrt"``) for ``torch.compile``. |
|
|
|
Example:: |
|
|
|
>>> @torch.compile( |
|
... backend="onnxrt", |
|
... options=torch.onnx._OrtBackendOptions(...), |
|
... ) |
|
... def ort_function(x): |
|
... return x ** x |
|
""" |
|
|
|
preferred_execution_providers: Optional[Sequence[OrtExecutionProvider]] = None |
|
"""An optional sequence of execution providers to be prioritized ahead of any |
|
execution providers that may be inferred (see ``infer_execution_providers``). |
|
""" |
|
|
|
infer_execution_providers: bool = True |
|
"""Whether to infer an execution provider from ``torch.device`` bound to inputs or found in the graph.""" |
|
|
|
default_execution_providers: Optional[Sequence[OrtExecutionProvider]] = None |
|
"""The default fallback execution providers. If not specified, one will be |
|
be selected based on the host environment (most likely ``"CPUExecutionProvider"``). |
|
""" |
|
|
|
    # Note: pre-allocation is only safe when output shapes are known ahead of
    # time (see its uses in _run_onnx_session_with_ortvaluevector).
preallocate_output: bool = False |
|
"""If ``True``, allocate memory for ONNX Runtime's outputs on the PyTorch side.""" |
|
|
|
use_aot_autograd: bool = True |
|
"""Whether to wrap the ``OrtBackend`` with TorchDynamo's aot_autograd backend |
|
to support training (i.e., backward graphs are also sent to ``OrtBackend``). |
|
|
|
Symbolic execution is used to capture the forward pass and backward passes as a single graph. |
|
Then, a selected graph partition algorithm (``min_cut_rematerialization_partition``) is used |
|
to split the entire graph into forward sub-graph and backward sub-graph. Finally, both |
|
sub-graphs are compiled by ``OrtBackend``. |
|
""" |
|
|
|
export_options: Optional["torch.onnx.ExportOptions"] = None |
|
"""Options for the TorchDynamo-based ONNX exporter used by the ``OrtBackend``.""" |
|
|
|
ort_session_options: Optional["onnxruntime.SessionOptions"] = None |
|
"""Options for the ``onnxruntime.InferenceSession`` used by the ``OrtBackend``.""" |
|
|
|
pre_ort_model_transforms: Optional[ |
|
Sequence[Callable[["onnx.ModelProto"], None]] |
|
] = None |
|
"""A list of graph transforms to be applied to the ONNX model before it |
|
is fed to ONNXRuntime's InferenceSession.""" |
|
|
|
|
|
@compatibility(is_backward_compatible=False) |
|
class OrtBackend: |
|
"""A backend compiles (sub-)graphs in torch.fx.GraphModule to onnxruntime.InferenceSession calls. |
|
|
|
The compiler entry point is OrtBackend.compile, which |
|
1. partitions the original graph into supported sub-graphs (type: torch.fx.GraphModule) and unsupported |
|
sub-graphs. |
|
2. For each supported sub-graph, it replaces its _wrapped_call function with _ort_accelerated_call. |
|
3. Inside _ort_accelerated_call, it creates onnxruntime.InferenceSession and calls it to execute the sub-graph. |
|
""" |
|
|
|
def __init__(self, options: Optional[OrtBackendOptions] = None): |
|
self._options: Final = OrtBackendOptions() if options is None else options |
|
        # Convert the user-facing ExportOptions into the resolved, internal
        # options used by the ONNX exporter. Useful fields include the
        # decomposition table and the onnx_registry of exportable aten/prim ops.
self._resolved_onnx_exporter_options = ( |
|
torch.onnx._internal.exporter.ResolvedExportOptions( |
|
torch.onnx.ExportOptions() |
|
if self._options.export_options is None |
|
else self._options.export_options |
|
) |
|
) |
|
        # support_dict collects the aten/prim op overloads that the ONNX
        # exporter can handle under the resolved onnx_registry; it drives the
        # graph partitioning performed in OrtBackend.compile.
support_dict = torch.onnx._internal.fx.decomposition_table._create_onnx_supports_op_overload_table( |
|
self._resolved_onnx_exporter_options.onnx_registry |
|
) |
|
|
|
        extra_support_dict: Dict[str, Any] = {
            "getattr": None,
            # Support accessing the elements of multi-output nodes.
            "_operator.getitem": None,
            # Python arithmetic on (symbolic) scalars.
            "_operator.mul": None,
            "_operator.add": None,
            "_operator.sub": None,
        }
|
|
|
self._supported_ops = OrtOperatorSupport(support_dict, extra_support_dict) |
|
        # Cache from an original GraphModule to its partitioned counterpart.
        # Note: keyed on GraphModule identity, with no guards on inputs.
self._partitioner_cache: Dict[torch.fx.GraphModule, torch.fx.GraphModule] = {} |
|
        # Maps each compiled torch.fx.GraphModule to everything needed to run
        # it with ONNX Runtime. Multiple sessions may exist per GraphModule,
        # e.g., when the same graph is called with inputs of different dtypes
        # or devices (see OrtExecutionInfoPerSession.is_supported).
self._all_ort_execution_info = OrtExecutionInfoForAllGraphModules() |
|
|
|
self._assert_allclose_to_baseline = False |
|
|
|
self.execution_count = 0 |
|
        # Function that invokes ORT to do the real computation; prefer the
        # zero-copy OrtValueVector path when available.
self.run = ( |
|
_run_onnx_session_with_ortvaluevector |
|
if hasattr(ORTC.OrtValueVector, "push_back_batch") |
|
else _run_onnx_session_with_fetch |
|
) |
|
|
|
def _select_eps( |
|
self, graph_module: torch.fx.GraphModule, *args |
|
) -> Sequence[Tuple[str, Mapping[str, Any]]]: |
|
inferred_eps: Tuple[str, ...] = tuple() |
|
if self._options.infer_execution_providers: |
|
if eps_from_args := _infer_ep_from_device(*args): |
|
                # Devices of the input tensors win; e.g., CUDA inputs imply
                # CUDAExecutionProvider.
inferred_eps = eps_from_args |
|
elif eps_from_graph_module := _infer_ep_from_graph_module(graph_module): |
|
                # No devices found on the inputs; fall back to the devices
                # recorded on the graph outputs.
inferred_eps = eps_from_graph_module |
|
|
|
selected_eps = [] |
|
|
|
for ep in ( |
|
*(self._options.preferred_execution_providers or []), |
|
*_sort_eps(inferred_eps), |
|
*(self._options.default_execution_providers or _infer_default_eps()), |
|
): |
|
if isinstance(ep, str): |
|
ep = (ep, {}) |
|
elif isinstance(ep, tuple) and ep[1] is None: |
|
ep = (ep[0], {}) |
|
if ep is not None and ep not in selected_eps: |
|
selected_eps.append(ep) |
|
|
|
return selected_eps |
|
|
|
    def _ort_accelerated_call(self, graph_module: torch.fx.GraphModule, *args, **kwargs):
|
"""This function replaces GraphModule._wrapped_call in compiled model. |
|
|
|
The _wrapped_call is the underlying implementation of forward method. Replacing |
|
it means we delegate the computation to _ort_acclerated_call and therefore |
|
onnxruntime.InferenceSession. |
|
""" |
|
cached_execution_info_per_session = ( |
|
self._all_ort_execution_info.search_reusable_session_execution_info( |
|
graph_module, *args |
|
) |
|
) |
|
if cached_execution_info_per_session: |
|
onnx_session = cached_execution_info_per_session.session |
|
input_names = cached_execution_info_per_session.input_names |
|
output_names = cached_execution_info_per_session.output_names |
|
input_value_infos = cached_execution_info_per_session.input_value_infos |
|
output_value_infos = cached_execution_info_per_session.output_value_infos |
|
input_devices = cached_execution_info_per_session.input_devices |
|
output_devices = cached_execution_info_per_session.output_devices |
|
prim_outputs = cached_execution_info_per_session.example_outputs |
|
else: |
|
            # First time seeing this graph: export it to ONNX and create a
            # fresh onnxruntime.InferenceSession for it.
graph_module = torch.onnx._internal.fx.passes.MovePlaceholderToFront( |
|
self._resolved_onnx_exporter_options.diagnostic_context, |
|
graph_module, |
|
).run() |
|
            # Generate reference outputs, which indicate the output tensors'
            # types and devices when calling ORT.
if self._resolved_onnx_exporter_options.dynamic_shapes: |
                # Disable pre-allocation: with dynamic shapes, output shapes
                # are not known ahead of time.
self.preallocate_output = False |
|
extracted_outputs = _extract_graph_module_outputs(graph_module) |
|
|
|
def maybe_map_to_meta_val(value): |
|
if hasattr(value, "meta") and "val" in value.meta: |
                        # "val" carries the example value needed to build
                        # ONNX-compatible reference outputs.
return value.meta["val"] |
|
else: |
|
return value |
|
|
|
prim_outputs = _pytree.tree_map( |
|
maybe_map_to_meta_val, extracted_outputs |
|
) |
|
else: |
|
try: |
|
prim_outputs = FakeTensorProp(graph_module).propagate( |
|
*args, **kwargs |
|
) |
|
except Exception: |
|
logger.warning("FakeTensorProb failed for %s", graph_module) |
|
                    # Without inferred output shapes, outputs cannot be
                    # pre-allocated.
self.preallocate_output = False |
|
                    # Re-raise: this failure is not yet handled gracefully.
raise |
|
|
|
            # The interpreter walks the FX graph node-by-node and dispatches
            # each node to its ONNX exporter function.
fx_interpreter = fx_onnx_interpreter.FxOnnxInterpreter( |
|
diagnostic_context=self._resolved_onnx_exporter_options.diagnostic_context |
|
) |
|
|
|
            # Make PyTorch's implicit dtype promotion explicit, so the exported
            # ONNX graph sees consistent types.
graph_module = torch.onnx._internal.fx.passes.InsertTypePromotion( |
|
self._resolved_onnx_exporter_options.diagnostic_context, graph_module |
|
).run() |
|
|
|
|
|
exported = fx_interpreter.run( |
|
fx_graph_module=graph_module, |
|
onnxfunction_dispatcher=self._resolved_onnx_exporter_options.onnxfunction_dispatcher, |
|
op_level_debug=self._resolved_onnx_exporter_options.op_level_debug, |
|
) |
|
|
|
onnx_model = exported.to_model_proto( |
|
opset_version=self._resolved_onnx_exporter_options.onnx_registry.opset_version, |
|
) |
|
|
|
try: |
|
from onnxscript import optimizer |
|
from onnxscript.rewriter import ( |
|
onnxruntime as ort_rewriter, |
|
) |
|
|
|
onnx_model = optimizer.optimize(onnx_model) |
|
onnx_model = ort_rewriter.rewrite(onnx_model) |
|
except ImportError: |
|
logger.warning( |
|
"ONNXScript optimizer is not available. Skipping optimization. " |
|
"Please `pip install onnxscript -U` to enable post-export optimization." |
|
) |
|
|
|
            # Apply any user-registered transforms to the ONNX model before it
            # is handed to ORT.
if self._options.pre_ort_model_transforms: |
|
for transform in self._options.pre_ort_model_transforms: |
|
transform(onnx_model) |
|
|
|
onnx_model_bytes = onnx_model.SerializeToString() |
|
if os.environ.get("ONNXRT_DUMP_PATH", None): |
|
                # Dump the ONNX model (and FX graph as text) for debugging;
                # each dump gets a unique increasing suffix.
_dump_onnx_model(onnx_model_bytes, graph_module=graph_module) |
|
|
|
            # Create the ORT session that executes this ONNX model. Execution
            # providers come from the backend options and from the devices
            # observed on the inputs and graph outputs.
onnx_session = onnxruntime.InferenceSession( |
|
path_or_bytes=onnx_model_bytes, |
|
sess_options=self._options.ort_session_options, |
|
providers=self._select_eps(graph_module, *args), |
|
) |
|
|
|
            # Record the session's I/O schema and devices so future calls can
            # reuse this session (see OrtExecutionInfoPerSession).
input_names = tuple(input.name for input in onnx_model.graph.input) |
|
output_names = tuple(output.name for output in onnx_model.graph.output) |
|
input_devices = _get_onnx_devices(args) |
|
|
|
|
|
|
|
if isinstance(prim_outputs, tuple): |
|
output_devices = _get_onnx_devices(prim_outputs) |
|
else: |
|
output_devices = _get_onnx_devices((prim_outputs,)) |
|
|
|
input_value_infos = tuple(input for input in onnx_model.graph.input) |
|
output_value_infos = tuple(output for output in onnx_model.graph.output) |
|
|
|
execution_info_per_session = OrtExecutionInfoPerSession( |
|
session=onnx_session, |
|
input_names=input_names, |
|
input_value_infos=input_value_infos, |
|
output_names=output_names, |
|
output_value_infos=output_value_infos, |
|
input_devices=input_devices, |
|
output_devices=output_devices, |
|
example_outputs=prim_outputs, |
|
) |
|
|
|
self._all_ort_execution_info.cache_session_execution_info( |
|
graph_module, execution_info_per_session |
|
) |
|
|
|
self.execution_count += 1 |
|
|
|
        # ORT deals in tuples of outputs; if the original graph produced a
        # single tensor, unwrap it before returning.
is_single_tensor_output = isinstance(prim_outputs, torch.Tensor) |
|
normalized_prim_outputs = ( |
|
(prim_outputs,) if is_single_tensor_output else prim_outputs |
|
) |
|
assert isinstance(normalized_prim_outputs, tuple) |
|
assert all( |
|
isinstance(elem, (torch.Tensor, torch.SymInt, int)) |
|
for elem in normalized_prim_outputs |
|
) |
|
|
|
_nvtx_range_push("run_onnx_session_with_ortvaluevector") |
|
onnx_outputs = self.run( |
|
onnx_session, |
|
input_names, |
|
args, |
|
input_devices, |
|
output_names, |
|
normalized_prim_outputs, |
|
output_devices, |
|
self._options.preallocate_output, |
|
input_value_infos, |
|
normalized_prim_outputs, |
|
) |
|
_nvtx_range_pop() |
|
|
|
if self._assert_allclose_to_baseline: |
|
            # Compute reference outputs with the ATen executor and compare.
baseline_outputs = torch._prims.executor.execute( |
|
graph_module, *args, executor="aten" |
|
) |
|
            normalized_baseline_outputs = (
|
(baseline_outputs,) if is_single_tensor_output else baseline_outputs |
|
) |
|
|
|
for onnx_output, baseline_output in zip( |
|
                onnx_outputs, normalized_baseline_outputs
|
): |
|
torch.testing.assert_close(onnx_output, baseline_output) |
|
return onnx_outputs[0] if is_single_tensor_output else onnx_outputs |
|
|
|
def compile(self, graph_module: torch.fx.GraphModule, args) -> torch.fx.GraphModule: |
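        """Partition ``graph_module`` and delegate ORT-supported partitions to
        ``_ort_accelerated_call``."""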
|
from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner |
|
        # Partition the FX graph by ONNX-exportability: supported nodes are
        # fused into "fused_*" sub-modules (compiled and run by ORT), while
        # unsupported nodes stay in the top-level module.
if graph_module in self._partitioner_cache: |
|
partitioned_prim_graph_module = self._partitioner_cache[graph_module] |
|
else: |
|
prim_graph_module = graph_module |
|
partitioner = CapabilityBasedPartitioner( |
|
prim_graph_module, |
|
self._supported_ops, |
|
allows_single_node_partition=True, |
|
) |
|
partitioned_prim_graph_module = partitioner.partition_and_fuse() |
|
self._partitioner_cache[graph_module] = partitioned_prim_graph_module |
|
        # Route every fused (ORT-supported) sub-module through
        # _ort_accelerated_call by overriding its _wrapped_call.
for node in partitioned_prim_graph_module.graph.nodes: |
|
            # The partitioner names ORT-supported sub-modules "fused_*".
if node.op == "call_module" and "fused_" in node.name: |
|
fused_module = getattr(partitioned_prim_graph_module, node.name) |
|
                # Replace the sub-module's forward implementation with the
                # ORT-backed call.
                fused_module._wrapped_call = self._ort_accelerated_call
|
|
|
return partitioned_prim_graph_module |
|
|
|
def __call__( |
|
self, graph_module: torch.fx.GraphModule, args |
|
) -> torch.fx.GraphModule: |
|
"""If ``OrtBackendOptions.use_aot_autograd`` is ``True``, the `auto_autograd` compiler |
|
will be invoked, wrapping this ``OrtBackend`` instance's ``compile`` method. Otherwise, |
|
the ``compile`` method is invoked directly.""" |
|
if self._options.use_aot_autograd: |
|
from functorch.compile import min_cut_rematerialization_partition |
|
|
|
from torch._dynamo.backends.common import aot_autograd |
|
|
|
return aot_autograd( |
|
fw_compiler=self.compile, |
|
partition_fn=min_cut_rematerialization_partition, |
|
decompositions=self._resolved_onnx_exporter_options.decomposition_table, |
|
)(graph_module, args) |
|
|
|
return self.compile(graph_module, args) |
|
|
|
__instance_cache_max_count: Final = 8 |
|
__instance_cache: Final[List["OrtBackend"]] = [] |
|
|
|
@staticmethod |
|
def get_cached_instance_for_options( |
|
options: Optional[Union[OrtBackendOptions, Mapping[str, Any]]] = None, |
|
) -> "OrtBackend": |
|
"""Returns a possibly cached instance of an ``OrtBackend``. If an existing |
|
backend was created previously through this function with the same options, |
|
it will be returned. Otherwise a new backend will be created, cached, and |
|
returned. |
|
|
|
Note: if ``options`` sets ``ort_session_options``, a new ``OrtBackend`` |
|
will always be returned, since ``onnxruntime.SessionOptions`` cannot |
|
participate in caching.""" |
|
|
|
def reusable(a: OrtBackendOptions, b: OrtBackendOptions): |
|
if ( |
|
a.preferred_execution_providers != b.preferred_execution_providers |
|
or a.infer_execution_providers != b.infer_execution_providers |
|
or a.default_execution_providers != b.default_execution_providers |
|
or a.preallocate_output != b.preallocate_output |
|
or a.use_aot_autograd != b.use_aot_autograd |
|
or a.pre_ort_model_transforms != b.pre_ort_model_transforms |
|
): |
|
return False |
|
|
|
            # onnxruntime.SessionOptions is a pybind11 object holding too much
            # state to compare reliably; if it is provided at all, opt out of
            # caching.
if a.ort_session_options is not None or b.ort_session_options is not None: |
|
return False |
|
|
|
if a.export_options is b.export_options: |
|
return True |
|
            # Otherwise, compare the individual fields that support stable
            # equality or identity checks.
if a.export_options is not None and b.export_options is not None: |
|
return ( |
|
a.export_options.dynamic_shapes == b.export_options.dynamic_shapes |
|
and a.export_options.op_level_debug |
|
== b.export_options.op_level_debug |
|
and a.export_options.diagnostic_options |
|
== b.export_options.diagnostic_options |
|
and a.export_options.onnx_registry is b.export_options.onnx_registry |
|
and a.export_options.fake_context is b.export_options.fake_context |
|
) |
|
|
            # Exactly one side has export_options; compatibility cannot be
            # verified, so do not reuse.
return False |
|
|
|
if not isinstance(options, OrtBackendOptions): |
|
options = OrtBackendOptions(**(options or {})) |
|
|
|
backend = next( |
|
(b for b in OrtBackend.__instance_cache if reusable(b._options, options)), |
|
None, |
|
) |
|
|
|
if backend is None: |
|
assert ( |
|
len(OrtBackend.__instance_cache) < OrtBackend.__instance_cache_max_count |
|
), ( |
|
f"No more than {OrtBackend.__instance_cache_max_count} instances of " |
|
f"{OrtBackend} allowed. Please instantiate `{OrtBackend}` explicitly " |
|
"to pass to `torch.compile`. " |
|
"See https://github.com/pytorch/pytorch/pull/107973#discussion_r1306144795 " |
|
"for discussion." |
|
) |
|
OrtBackend.__instance_cache.append(backend := OrtBackend(options)) |
|
|
|
return backend |
|
|
|
@staticmethod |
|
def clear_cached_instances(): |
|
OrtBackend.__instance_cache.clear() |
|
|
|
@staticmethod |
|
def get_cached_instances(): |
|
return tuple(OrtBackend.__instance_cache) |
|
|
|
|
|
@compatibility(is_backward_compatible=False) |
|
def torch_compile_backend( |
|
graph_module: torch.fx.GraphModule, |
|
args, |
|
*, |
|
options: Optional[Union[OrtBackendOptions, Mapping[str, Any]]] = None, |
|
): |
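    """TorchDynamo-compatible entry point backing the ``"onnxrt"`` backend.

    A hypothetical end-to-end sketch (requires onnx, onnxscript, and
    onnxruntime to be installed)::

        >>> # xdoctest: +SKIP("requires onnxruntime")
        >>> @torch.compile(backend="onnxrt")
        ... def f(x):
        ...     return x.relu()
    """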
|
return OrtBackend.get_cached_instance_for_options(options)(graph_module, args) |
|
|